From 20afaae0209172b02d8df0a33e9391e0a18f33d6 Mon Sep 17 00:00:00 2001
From: jmorganca
Date: Tue, 28 May 2024 00:02:01 -0700
Subject: [PATCH] add more runner params

---
 llama/llama.go         | 30 +++++++++++++++++++++++++-----
 llama/runner/runner.go | 38 +++++++++++++++++++++++++++-----------
 2 files changed, 52 insertions(+), 16 deletions(-)

diff --git a/llama/llama.go b/llama/llama.go
index 76c458f1..9b75e388 100644
--- a/llama/llama.go
+++ b/llama/llama.go
@@ -31,6 +31,7 @@ package llama
 // #include "sampling_ext.h"
 import "C"
 import (
+	"errors"
 	"fmt"
 	"runtime"
 	"strings"
@@ -49,13 +50,14 @@ type ContextParams struct {
 	c C.struct_llama_context_params
 }
 
-func NewContextParams() ContextParams {
+func NewContextParams(numCtx int, threads int, flashAttention bool) ContextParams {
 	params := C.llama_context_default_params()
-	params.seed = C.uint(1234)
-	params.n_ctx = C.uint(2048)
+	params.n_ctx = C.uint(numCtx)
 	params.n_threads = C.uint(runtime.NumCPU())
 	params.n_threads_batch = params.n_threads
 	params.embeddings = C.bool(true)
+	params.flash_attn = C.bool(flashAttention)
+	params.n_threads = C.uint(threads)
 	return ContextParams{c: params}
 }
 
@@ -63,9 +65,10 @@ type ModelParams struct {
 	c C.struct_llama_model_params
 }
 
-func NewModelParams() ModelParams {
+func NewModelParams(numGpuLayers int, mainGpu int) ModelParams {
 	params := C.llama_model_default_params()
-	params.n_gpu_layers = 999
+	params.n_gpu_layers = C.int(numGpuLayers)
+	params.main_gpu = C.int32_t(mainGpu)
 	return ModelParams{c: params}
 }
 
@@ -155,6 +158,23 @@ func (m *Model) TokenIsEog(token int) bool {
 	return bool(C.llama_token_is_eog(m.c, C.llama_token(token)))
 }
 
+func (m *Model) ApplyLoraFromFile(loraPath string, scale float32, baseModelPath string, threads int) error {
+	cLoraPath := C.CString(loraPath)
+	defer C.free(unsafe.Pointer(cLoraPath))
+
+	var cBaseModelPath *C.char
+	if baseModelPath != "" {
+		cBaseModelPath = C.CString(baseModelPath)
+	}
+
+	code := int(C.llama_model_apply_lora_from_file(m.c, cLoraPath, C.float(scale), cBaseModelPath, C.int32_t(threads)))
+	if code != 0 {
+		return errors.New("error applying lora from file")
+	}
+
+	return nil
+}
+
 type Batch struct {
 	c C.struct_llama_batch
 }
diff --git a/llama/runner/runner.go b/llama/runner/runner.go
index 54210a49..f029473b 100644
--- a/llama/runner/runner.go
+++ b/llama/runner/runner.go
@@ -9,6 +9,7 @@ import (
 	"log/slog"
 	"net"
 	"net/http"
+	"runtime"
 	"strconv"
 	"strings"
 	"sync"
@@ -73,6 +74,8 @@ type Server struct {
 	lc    *llama.Context
 	cc    *llama.ClipContext
 
+	batchSize int
+
 	// parallel is the number of parallel requests to handle
 	parallel int
 
@@ -154,7 +157,7 @@ func truncateStop(pieces []string, stop string) []string {
 }
 
 func (s *Server) run(ctx context.Context) {
-	batch := llama.NewBatch(512, 0, s.parallel)
+	batch := llama.NewBatch(s.batchSize, 0, s.parallel)
 	defer batch.Free()
 
 	// build up stop sequences as we recognize them
@@ -182,7 +185,7 @@ func (s *Server) run(ctx context.Context) {
 
 		for j, t := range seq.tokens {
 			// todo: make this n_batch
-			if j > 512 {
+			if j > s.batchSize {
 				break
 			}
 
@@ -207,10 +210,10 @@ func (s *Server) run(ctx context.Context) {
 
 			// don't sample prompt processing
 			if seq.prompt() {
-				if len(seq.tokens) < 512 {
+				if len(seq.tokens) < s.batchSize {
 					seq.tokens = []int{}
 				} else {
-					seq.tokens = seq.tokens[512:]
+					seq.tokens = seq.tokens[s.batchSize:]
 				}
 
 				continue
@@ -412,14 +415,26 @@ func main() {
 	mpath := flag.String("model", "", "Path to model binary file")
 	ppath := flag.String("projector", "", "Path to projector binary file")
 	parallel := flag.Int("parallel", 1, "Number of sequences to handle simultaneously")
+	batchSize := flag.Int("batch-size", 512, "Batch size")
+	nGpuLayers := flag.Int("n-gpu-layers", 0, "Number of layers to offload to GPU")
+	mainGpu := flag.Int("main-gpu", 0, "Main GPU")
+	flashAttention := flag.Bool("flash-attention", false, "Enable flash attention")
+	numCtx := flag.Int("num-ctx", 2048, "Context (or KV cache) size")
+	lpath := flag.String("lora", "", "Path to lora layer file")
 	port := flag.Int("port", 8080, "Port to expose the server on")
+	threads := flag.Int("threads", runtime.NumCPU(), "Number of threads to use during generation")
 	flag.Parse()
 
 	// load the model
 	llama.BackendInit()
-	params := llama.NewModelParams()
+	params := llama.NewModelParams(*nGpuLayers, *mainGpu)
 	model := llama.LoadModelFromFile(*mpath, params)
-	ctxParams := llama.NewContextParams()
+
+	if *lpath != "" {
+		model.ApplyLoraFromFile(*lpath, 1.0, "", *threads)
+	}
+
+	ctxParams := llama.NewContextParams(*numCtx, *threads, *flashAttention)
 	lc := llama.NewContextWithModel(model, ctxParams)
 	if lc == nil {
 		panic("Failed to create context")
@@ -434,11 +449,12 @@ func main() {
 	}
 
 	server := &Server{
-		model:    model,
-		lc:       lc,
-		cc:       cc,
-		parallel: *parallel,
-		seqs:     make([]*Sequence, *parallel),
+		model:     model,
+		lc:        lc,
+		cc:        cc,
+		batchSize: *batchSize,
+		parallel:  *parallel,
+		seqs:      make([]*Sequence, *parallel),
 	}
 
 	server.cond = sync.NewCond(&server.mu)
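
A hypothetical launcher sketch in Go, exercising the flags introduced by this patch from a parent process via os/exec; the ./runner binary name, the model path, and the specific flag values are placeholder assumptions, and only the flag names and their defaults come from the change above:

package main

import (
	"log"
	"os"
	"os/exec"
)

func main() {
	// Spawn the runner with the new flags; binary path, model path and
	// values below are placeholders, not prescribed by the patch.
	cmd := exec.Command("./runner",
		"--model", "model.gguf",
		"--num-ctx", "4096",
		"--batch-size", "512",
		"--n-gpu-layers", "33",
		"--main-gpu", "0",
		"--flash-attention",
		"--threads", "8",
		"--parallel", "2",
		"--port", "8080",
	)
	cmd.Stdout = os.Stdout
	cmd.Stderr = os.Stderr
	if err := cmd.Run(); err != nil {
		log.Fatal(err)
	}
}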