runner.go: Support resource usage command line options
Command line options to the runner that control resource usage (mmap, mlock, tensor split) are passed by Ollama but were not previously implemented. This commit adds support for them, while continuing to accept and ignore other options that have no meaning in this context.
commit e4a091bafd (parent fd4ecd1ff5)
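
As a quick sketch of the resulting API (illustrative only: the import path, model path, and split proportions are assumptions; the ModelParams fields are the ones introduced in llama.go below), a caller can now express the resource options directly:

package main

// Illustrative use of the struct-based ModelParams added by this commit.
// The import path and all literal values here are placeholders.
import (
	"fmt"

	"github.com/ollama/ollama/llama"
)

func main() {
	llama.BackendInit()

	params := llama.ModelParams{
		NumGpuLayers: 999,
		MainGpu:      0,
		UseMmap:      false,           // e.g. when --no-mmap is passed
		UseMlock:     true,            // e.g. when --mlock is passed
		TensorSplit:  []float32{3, 1}, // e.g. --tensor-split 3,1
		Progress: func(p float32) {
			fmt.Printf("loading... %f\n", p)
		},
	}

	model := llama.LoadModelFromFile("/path/to/model.gguf", params)
	_ = model
}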
@@ -29,9 +29,14 @@ func main() {
 	// load the model
 	llama.BackendInit()
-	params := llama.NewModelParams(999, 0, func(p float32) {
-		fmt.Printf("loading... %f\n", p)
-	})
+	params := llama.ModelParams{
+		NumGpuLayers: 999,
+		MainGpu:      0,
+		UseMmap:      true,
+		Progress: func(p float32) {
+			fmt.Printf("loading... %f\n", p)
+		},
+	}
 	model := llama.LoadModelFromFile(*mpath, params)
 	ctxParams := llama.NewContextParams(2048, runtime.NumCPU(), false)
@@ -78,33 +78,6 @@ func NewContextParams(numCtx int, threads int, flashAttention bool) ContextParams {
 	return ContextParams{c: params}
 }
 
-type ModelParams struct {
-	c C.struct_llama_model_params
-}
-
-//export llamaProgressCallback
-func llamaProgressCallback(progress C.float, userData unsafe.Pointer) C.bool {
-	handle := cgo.Handle(userData)
-	callback := handle.Value().(func(float32))
-	callback(float32(progress))
-	return true
-}
-
-func NewModelParams(numGpuLayers int, mainGpu int, callback func(float32)) ModelParams {
-	params := C.llama_model_default_params()
-	params.n_gpu_layers = C.int(numGpuLayers)
-	params.main_gpu = C.int32_t(mainGpu)
-
-	handle := cgo.NewHandle(callback)
-	params.progress_callback = C.llama_progress_callback(C.llamaProgressCallback)
-	params.progress_callback_user_data = unsafe.Pointer(handle)
-	runtime.SetFinalizer(&params, func(p *C.struct_llama_model_params) {
-		handle.Delete()
-	})
-
-	return ModelParams{c: params}
-}
-
 type Context struct {
 	c *C.struct_llama_context
 }
@@ -179,8 +152,49 @@ func (c *Context) GetEmbeddingsIth(i int) []float32 {
 	return unsafe.Slice((*float32)(unsafe.Pointer(C.llama_get_embeddings_ith(c.c, C.int32_t(i)))), c.Model().NEmbd())
 }
 
+type ModelParams struct {
+	NumGpuLayers int
+	MainGpu      int
+	UseMmap      bool
+	UseMlock     bool
+	TensorSplit  []float32
+	Progress     func(float32)
+}
+
+//export llamaProgressCallback
+func llamaProgressCallback(progress C.float, userData unsafe.Pointer) C.bool {
+	handle := cgo.Handle(userData)
+	callback := handle.Value().(func(float32))
+	callback(float32(progress))
+	return true
+}
+
 func LoadModelFromFile(modelPath string, params ModelParams) *Model {
-	return &Model{c: C.llama_load_model_from_file(C.CString(modelPath), params.c)}
+	cparams := C.llama_model_default_params()
+	cparams.n_gpu_layers = C.int(params.NumGpuLayers)
+	cparams.main_gpu = C.int32_t(params.MainGpu)
+	cparams.use_mmap = C.bool(params.UseMmap)
+	cparams.use_mlock = C.bool(params.UseMlock)
+
+	if len(params.TensorSplit) > 0 {
+		tensorSplitData := &params.TensorSplit[0]
+
+		var tensorSplitPin runtime.Pinner
+		tensorSplitPin.Pin(tensorSplitData)
+		defer tensorSplitPin.Unpin()
+
+		cparams.tensor_split = (*C.float)(unsafe.Pointer(tensorSplitData))
+	}
+
+	if params.Progress != nil {
+		handle := cgo.NewHandle(params.Progress)
+		defer handle.Delete()
+
+		cparams.progress_callback = C.llama_progress_callback(C.llamaProgressCallback)
+		cparams.progress_callback_user_data = unsafe.Pointer(handle)
+	}
+
+	return &Model{c: C.llama_load_model_from_file(C.CString(modelPath), cparams)}
 }
 
 func NewContextWithModel(model *Model, params ContextParams) *Context {
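
The TensorSplit slice is handed to llama.cpp as a raw *C.float, so LoadModelFromFile pins its backing array with runtime.Pinner for the duration of the load call. A minimal standalone sketch of that pinning pattern (stdlib only, Go 1.21+; not tied to any model):

package main

import (
	"fmt"
	"runtime"
	"unsafe"
)

func main() {
	split := []float32{0.25, 0.75}

	// Pin the first element (and thus the backing array) so a raw pointer to
	// it remains valid and legal to hand across a cgo boundary until Unpin.
	var pin runtime.Pinner
	pin.Pin(&split[0])
	defer pin.Unpin()

	p := (*float32)(unsafe.Pointer(&split[0]))
	fmt.Println(*p) // 0.25
}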
@@ -12,6 +12,7 @@ import (
 	"net/http"
 	"os"
 	"path/filepath"
+	"regexp"
 	"runtime"
 	"strconv"
 	"strings"
@@ -599,16 +600,16 @@ func main() {
 	lpath := flag.String("lora", "", "Path to lora layer file")
 	port := flag.Int("port", 8080, "Port to expose the server on")
 	threads := flag.Int("threads", runtime.NumCPU(), "Number of threads to use during generation")
-
-	// TODO not yet implemented but wired to keep the parsing aligned
-	embedding := flag.Bool("embedding", false, "enable embedding vector output (default: disabled)")
-	logDisable := flag.Bool("log-disable", false, "disables logging to a file")
 	verbose := flag.Bool("verbose", false, "verbose output (default: disabled)")
-	f32 := flag.Bool("memory-f32", false, "use f32 instead of f16 for memory key+value (default: disabled) not recommended: doubles context memory required and no measurable increase in quality")
 	noMmap := flag.Bool("no-mmap", false, "do not memory-map model (slower load but may reduce pageouts if not using mlock)")
 	mlock := flag.Bool("mlock", false, "force system to keep model in RAM rather than swapping or compressing")
 	tensorSplit := flag.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions")
 
+	// These are either ignored by llama.cpp or have no significance to us
+	_ = flag.Bool("embedding", false, "enable embedding vector output (default: disabled)")
+	_ = flag.Bool("log-disable", false, "disables logging to a file")
+	_ = flag.Bool("memory-f32", false, "use f32 instead of f16 for memory key+value (default: disabled) not recommended: doubles context memory required and no measurable increase in quality")
+
 	flag.Parse()
 	level := slog.LevelInfo
 	if *verbose {
@@ -627,26 +628,6 @@ func main() {
 	})
 	slog.SetDefault(slog.New(handler))
 
-	// TODO actually implement...
-	if *embedding {
-		slog.Warn("embeddings not yet supported")
-	}
-	if *logDisable {
-		slog.Info("ignoring --log-disable")
-	}
-	if *f32 {
-		slog.Warn("memory-f32 not yet supported")
-	}
-	if *noMmap {
-		slog.Warn("no-mmap not yet supported")
-	}
-	if *mlock {
-		slog.Warn("mlock not yet supported")
-	}
-	if *tensorSplit != "" {
-		slog.Warn("tensor-split not yet implemented")
-	}
-
 	server := &Server{
 		numCtx:    *kvSize / *parallel,
 		batchSize: *batchSize,
@@ -659,10 +640,29 @@ func main() {
 	// otherwise Ollama can timeout for large model loads
 	// load the model
 	llama.BackendInit()
-	params := llama.NewModelParams(*nGpuLayers, *mainGpu, func(progress float32) {
-		slog.Debug("Loading model", "progress %", math.Round(float64(progress*100)))
-		server.progress = progress
-	})
+
+	var tensorSplitFloats []float32
+	if *tensorSplit != "" {
+		stringFloats := regexp.MustCompile(",").Split(*tensorSplit, -1)
+
+		tensorSplitFloats = make([]float32, 0, len(stringFloats))
+		for _, s := range stringFloats {
+			f, _ := strconv.ParseFloat(s, 32)
+			tensorSplitFloats = append(tensorSplitFloats, float32(f))
+		}
+	}
+
+	params := llama.ModelParams{
+		NumGpuLayers: *nGpuLayers,
+		MainGpu:      *mainGpu,
+		UseMmap:      !*noMmap && *lpath == "",
+		UseMlock:     *mlock,
+		TensorSplit:  tensorSplitFloats,
+		Progress: func(progress float32) {
+			slog.Debug("Loading model", "progress %", math.Round(float64(progress*100)))
+			server.progress = progress
+		},
+	}
 	server.model = llama.LoadModelFromFile(*mpath, params)
 
 	if *lpath != "" {
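
For reference, the --tensor-split handling above turns a comma-separated list of proportions into a []float32 that flows through ModelParams.TensorSplit to llama.cpp. A self-contained sketch of that conversion (a hypothetical helper, not code from this commit; the runner inlines the same logic, splits with regexp, and ignores parse errors):

package main

import (
	"fmt"
	"strconv"
	"strings"
)

// parseTensorSplit converts a comma-separated list of proportions, as given
// on the --tensor-split flag, into the []float32 form expected by
// ModelParams.TensorSplit.
func parseTensorSplit(s string) []float32 {
	if s == "" {
		return nil
	}
	parts := strings.Split(s, ",")
	out := make([]float32, 0, len(parts))
	for _, p := range parts {
		f, _ := strconv.ParseFloat(p, 32) // parse errors yield 0, matching the runner's behavior
		out = append(out, float32(f))
	}
	return out
}

func main() {
	// "3,1" offloads tensors to two GPUs in a 3:1 proportion.
	fmt.Println(parseTensorSplit("3,1")) // [3 1]
}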