runner.go: Support resource usage command line options

Command line options to the runner that control resource usage
(mmap, mlock, tensor split) are used by Ollama but not currently
implemented. This implements support for these while ignoring
others that have no meaning in this context.
Jesse Gross 2024-08-28 09:29:09 -07:00 committed by jmorganca
parent fd4ecd1ff5
commit e4a091bafd
3 changed files with 79 additions and 60 deletions
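
For orientation, the resource-usage options end up as plain fields on the new llama.ModelParams struct introduced in the diff below. A minimal caller-side sketch; the import path and the model path are assumptions, while the field names, BackendInit, and LoadModelFromFile come straight from the diff:

    package main

    import (
        "fmt"

        "github.com/ollama/ollama/llama" // assumed import path for the package changed below
    )

    func main() {
        llama.BackendInit()

        params := llama.ModelParams{
            NumGpuLayers: 999,
            MainGpu:      0,
            UseMmap:      false,               // what -no-mmap ends up controlling
            UseMlock:     true,                // what -mlock ends up controlling
            TensorSplit:  []float32{0.6, 0.4}, // what -tensor-split 0.6,0.4 parses into
            Progress: func(p float32) {
                fmt.Printf("loading... %f\n", p) // loading progress in [0, 1]
            },
        }

        model := llama.LoadModelFromFile("/path/to/model.gguf", params) // placeholder path
        fmt.Println(model != nil)
    }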


@@ -29,9 +29,14 @@ func main() {
 
     // load the model
     llama.BackendInit()
-    params := llama.NewModelParams(999, 0, func(p float32) {
-        fmt.Printf("loading... %f\n", p)
-    })
+    params := llama.ModelParams{
+        NumGpuLayers: 999,
+        MainGpu:      0,
+        UseMmap:      true,
+        Progress: func(p float32) {
+            fmt.Printf("loading... %f\n", p)
+        },
+    }
     model := llama.LoadModelFromFile(*mpath, params)
 
     ctxParams := llama.NewContextParams(2048, runtime.NumCPU(), false)
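
One consequence of swapping the NewModelParams constructor for a plain struct literal is that defaults are now the zero values, so options such as memory mapping must be requested explicitly. A toy illustration (Params here is a stand-in for illustration, not the real type):

    package main

    import "fmt"

    // Params mirrors the shape of llama.ModelParams purely for illustration.
    type Params struct {
        NumGpuLayers int
        UseMmap      bool
    }

    func main() {
        // Fields left out of a struct literal take their zero values, which is
        // why the hunk above sets UseMmap: true explicitly; omitting it would
        // silently turn memory mapping off.
        p := Params{NumGpuLayers: 999}
        fmt.Println(p.UseMmap) // false
    }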


@@ -78,33 +78,6 @@ func NewContextParams(numCtx int, threads int, flashAttention bool) ContextParams {
     return ContextParams{c: params}
 }
-
-type ModelParams struct {
-    c C.struct_llama_model_params
-}
-
-//export llamaProgressCallback
-func llamaProgressCallback(progress C.float, userData unsafe.Pointer) C.bool {
-    handle := cgo.Handle(userData)
-    callback := handle.Value().(func(float32))
-    callback(float32(progress))
-    return true
-}
-
-func NewModelParams(numGpuLayers int, mainGpu int, callback func(float32)) ModelParams {
-    params := C.llama_model_default_params()
-    params.n_gpu_layers = C.int(numGpuLayers)
-    params.main_gpu = C.int32_t(mainGpu)
-
-    handle := cgo.NewHandle(callback)
-    params.progress_callback = C.llama_progress_callback(C.llamaProgressCallback)
-    params.progress_callback_user_data = unsafe.Pointer(handle)
-    runtime.SetFinalizer(&params, func(p *C.struct_llama_model_params) {
-        handle.Delete()
-    })
-
-    return ModelParams{c: params}
-}
 
 type Context struct {
     c *C.struct_llama_context
 }
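
The progress callback still crosses the C boundary through a runtime/cgo handle; what changes is its lifetime. The old code tied handle.Delete() to a finalizer on the params struct, while the new LoadModelFromFile (next hunk) deletes the handle with a defer once the synchronous load returns. A standalone sketch of the handle round trip, where invokeFromC is a stand-in for llama.cpp calling back through the opaque pointer:

    package main

    import (
        "fmt"
        "runtime/cgo"
        "unsafe"
    )

    // invokeFromC stands in for the C side of the bindings: it only ever sees an
    // opaque pointer and recovers the Go callback through the handle.
    func invokeFromC(progress float32, userData unsafe.Pointer) {
        handle := cgo.Handle(userData)
        callback := handle.Value().(func(float32))
        callback(progress)
    }

    func main() {
        handle := cgo.NewHandle(func(p float32) { fmt.Printf("loading... %f\n", p) })
        defer handle.Delete() // the handle only needs to outlive the load call

        invokeFromC(0.25, unsafe.Pointer(handle))
        invokeFromC(1.0, unsafe.Pointer(handle))
    }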
@@ -179,8 +152,49 @@ func (c *Context) GetEmbeddingsIth(i int) []float32 {
     return unsafe.Slice((*float32)(unsafe.Pointer(C.llama_get_embeddings_ith(c.c, C.int32_t(i)))), c.Model().NEmbd())
 }
 
+type ModelParams struct {
+    NumGpuLayers int
+    MainGpu      int
+    UseMmap      bool
+    UseMlock     bool
+    TensorSplit  []float32
+    Progress     func(float32)
+}
+
+//export llamaProgressCallback
+func llamaProgressCallback(progress C.float, userData unsafe.Pointer) C.bool {
+    handle := cgo.Handle(userData)
+    callback := handle.Value().(func(float32))
+    callback(float32(progress))
+    return true
+}
+
 func LoadModelFromFile(modelPath string, params ModelParams) *Model {
-    return &Model{c: C.llama_load_model_from_file(C.CString(modelPath), params.c)}
+    cparams := C.llama_model_default_params()
+    cparams.n_gpu_layers = C.int(params.NumGpuLayers)
+    cparams.main_gpu = C.int32_t(params.MainGpu)
+    cparams.use_mmap = C.bool(params.UseMmap)
+    cparams.use_mlock = C.bool(params.UseMlock)
+
+    if len(params.TensorSplit) > 0 {
+        tensorSplitData := &params.TensorSplit[0]
+
+        var tensorSplitPin runtime.Pinner
+        tensorSplitPin.Pin(tensorSplitData)
+        defer tensorSplitPin.Unpin()
+
+        cparams.tensor_split = (*C.float)(unsafe.Pointer(tensorSplitData))
+    }
+
+    if params.Progress != nil {
+        handle := cgo.NewHandle(params.Progress)
+        defer handle.Delete()
+
+        cparams.progress_callback = C.llama_progress_callback(C.llamaProgressCallback)
+        cparams.progress_callback_user_data = unsafe.Pointer(handle)
+    }
+
+    return &Model{c: C.llama_load_model_from_file(C.CString(modelPath), cparams)}
 }
 
 func NewContextWithModel(model *Model, params ContextParams) *Context {
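
TensorSplit is the one field that hands C a pointer into Go memory, hence the runtime.Pinner: the slice's backing array is pinned while the raw pointer is outstanding and unpinned when LoadModelFromFile returns. A self-contained sketch of the same pattern, where readFloatsFromC is a stand-in for the C consumer of tensor_split:

    package main

    import (
        "fmt"
        "runtime"
        "unsafe"
    )

    // readFloatsFromC stands in for C code that reads n floats from a raw pointer,
    // the way llama.cpp consumes llama_model_params.tensor_split.
    func readFloatsFromC(data *float32, n int) {
        fmt.Println(unsafe.Slice(data, n))
    }

    func main() {
        tensorSplit := []float32{0.6, 0.4}
        data := &tensorSplit[0]

        // Pin the slice's backing array while a raw pointer to it is held,
        // mirroring the tensorSplitPin handling in LoadModelFromFile.
        var pin runtime.Pinner
        pin.Pin(data)
        defer pin.Unpin()

        readFloatsFromC(data, len(tensorSplit))
    }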


@@ -12,6 +12,7 @@ import (
     "net/http"
     "os"
     "path/filepath"
+    "regexp"
     "runtime"
     "strconv"
     "strings"
@@ -599,16 +600,16 @@ func main() {
     lpath := flag.String("lora", "", "Path to lora layer file")
     port := flag.Int("port", 8080, "Port to expose the server on")
     threads := flag.Int("threads", runtime.NumCPU(), "Number of threads to use during generation")
-    // TODO not yet implemented but wired to keep the parsing aligned
-    embedding := flag.Bool("embedding", false, "enable embedding vector output (default: disabled)")
-    logDisable := flag.Bool("log-disable", false, "disables logging to a file")
     verbose := flag.Bool("verbose", false, "verbose output (default: disabled)")
-    f32 := flag.Bool("memory-f32", false, "use f32 instead of f16 for memory key+value (default: disabled) not recommended: doubles context memory required and no measurable increase in quality")
     noMmap := flag.Bool("no-mmap", false, "do not memory-map model (slower load but may reduce pageouts if not using mlock)")
     mlock := flag.Bool("mlock", false, "force system to keep model in RAM rather than swapping or compressing")
     tensorSplit := flag.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions")
 
+    // These are either ignored by llama.cpp or have no significance to us
+    _ = flag.Bool("embedding", false, "enable embedding vector output (default: disabled)")
+    _ = flag.Bool("log-disable", false, "disables logging to a file")
+    _ = flag.Bool("memory-f32", false, "use f32 instead of f16 for memory key+value (default: disabled) not recommended: doubles context memory required and no measurable increase in quality")
     flag.Parse()
 
     level := slog.LevelInfo
     if *verbose {
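
The `_ = flag.Bool(...)` lines are not dead code: with the standard library's default ExitOnError command line, an option the runner does not register makes flag.Parse print usage and exit, so flags Ollama still passes have to stay registered even though their values are discarded. A minimal illustration:

    package main

    import (
        "flag"
        "fmt"
    )

    func main() {
        // Registered but discarded: parsing succeeds, the value is simply unused.
        _ = flag.Bool("log-disable", false, "accepted for compatibility, ignored")

        // Registered and used.
        mlock := flag.Bool("mlock", false, "keep the model resident in RAM")

        // With the default ExitOnError command line, an unregistered option such
        // as -log-disable would instead print usage and exit here.
        flag.Parse()

        fmt.Println("mlock:", *mlock)
    }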
@@ -627,26 +628,6 @@ func main() {
     })
     slog.SetDefault(slog.New(handler))
 
-    // TODO actually implement...
-    if *embedding {
-        slog.Warn("embeddings not yet supported")
-    }
-    if *logDisable {
-        slog.Info("ignoring --log-disable")
-    }
-    if *f32 {
-        slog.Warn("memory-f32 not yet supported")
-    }
-    if *noMmap {
-        slog.Warn("no-mmap not yet supported")
-    }
-    if *mlock {
-        slog.Warn("mlock not yet supported")
-    }
-    if *tensorSplit != "" {
-        slog.Warn("tensor-split not yet implemented")
-    }
-
     server := &Server{
         numCtx:    *kvSize / *parallel,
         batchSize: *batchSize,
@@ -659,10 +640,29 @@
     // otherwise Ollama can timeout for large model loads
     // load the model
     llama.BackendInit()
-    params := llama.NewModelParams(*nGpuLayers, *mainGpu, func(progress float32) {
-        slog.Debug("Loading model", "progress %", math.Round(float64(progress*100)))
-        server.progress = progress
-    })
+
+    var tensorSplitFloats []float32
+    if *tensorSplit != "" {
+        stringFloats := regexp.MustCompile(",").Split(*tensorSplit, -1)
+
+        tensorSplitFloats = make([]float32, 0, len(stringFloats))
+        for _, s := range stringFloats {
+            f, _ := strconv.ParseFloat(s, 32)
+            tensorSplitFloats = append(tensorSplitFloats, float32(f))
+        }
+    }
+
+    params := llama.ModelParams{
+        NumGpuLayers: *nGpuLayers,
+        MainGpu:      *mainGpu,
+        UseMmap:      !*noMmap && *lpath == "",
+        UseMlock:     *mlock,
+        TensorSplit:  tensorSplitFloats,
+        Progress: func(progress float32) {
+            slog.Debug("Loading model", "progress %", math.Round(float64(progress*100)))
+            server.progress = progress
+        },
+    }
     server.model = llama.LoadModelFromFile(*mpath, params)
 
     if *lpath != "" {
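
The -tensor-split argument is a comma-separated list of proportions; the hunk above splits it with a regexp and silently discards strconv.ParseFloat errors (`f, _ := ...`). Note also that UseMmap is turned off not only for -no-mmap but whenever a LoRA path is supplied. A hypothetical standalone equivalent of the parsing, using strings.Split and surfacing the conversion error, for readers who want it in one place (parseTensorSplit is not part of the commit):

    package main

    import (
        "fmt"
        "strconv"
        "strings"
    )

    // parseTensorSplit mirrors the parsing in main: split on commas and convert
    // each piece, except that conversion errors are returned instead of ignored.
    func parseTensorSplit(s string) ([]float32, error) {
        if s == "" {
            return nil, nil
        }
        parts := strings.Split(s, ",")
        out := make([]float32, 0, len(parts))
        for _, p := range parts {
            f, err := strconv.ParseFloat(strings.TrimSpace(p), 32)
            if err != nil {
                return nil, fmt.Errorf("invalid tensor-split value %q: %w", p, err)
            }
            out = append(out, float32(f))
        }
        return out, nil
    }

    func main() {
        fmt.Println(parseTensorSplit("0.6,0.4"))
    }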