From 55fb0633db706249b0e5326d89623b605b3e9f4b Mon Sep 17 00:00:00 2001
From: Jesse Gross
Date: Fri, 23 Aug 2024 17:27:09 -0700
Subject: [PATCH] runner.go: Separate KV cache and context sizes

Currently the entire KV cache is shared by all parallel requestors.
This gives maximum resource utilization but there is a potential for
overflow and unfairness if multiple requests are trying to use
significant context. Instead, it is better to have a hard partition of
KV cache space.
---
 llama/runner/runner.go | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llama/runner/runner.go b/llama/runner/runner.go
index 52087276..77d7bdee 100644
--- a/llama/runner/runner.go
+++ b/llama/runner/runner.go
@@ -594,7 +594,7 @@ func main() {
 	nGpuLayers := flag.Int("n-gpu-layers", 0, "Number of layers to offload to GPU")
 	mainGpu := flag.Int("main-gpu", 0, "Main GPU")
 	flashAttention := flag.Bool("flash-attn", false, "Enable flash attention")
-	numCtx := flag.Int("ctx-size", 2048, "Context (or KV cache) size")
+	kvSize := flag.Int("ctx-size", 2048, "Context (or KV cache) size")
 	lpath := flag.String("lora", "", "Path to lora layer file")
 	port := flag.Int("port", 8080, "Port to expose the server on")
 	threads := flag.Int("threads", runtime.NumCPU(), "Number of threads to use during generation")
@@ -647,7 +647,7 @@ func main() {
 	}
 
 	server := &Server{
-		numCtx:    *numCtx,
+		numCtx:    *kvSize / *parallel,
 		batchSize: *batchSize,
 		parallel:  *parallel,
 		seqs:      make([]*Sequence, *parallel),
@@ -669,7 +669,7 @@ func main() {
 		}
 	}
 
-	ctxParams := llama.NewContextParams(*numCtx, *threads, *flashAttention)
+	ctxParams := llama.NewContextParams(*kvSize, *threads, *flashAttention)
 	server.lc = llama.NewContextWithModel(server.model, ctxParams)
 
 	if server.model.ShouldAddBOSToken() {
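
The effect of the change is that -ctx-size now specifies the total KV
cache, while each of the parallel sequence slots receives a fixed,
equal share of it. A minimal standalone sketch of that arithmetic
follows (the flag names mirror runner.go, but the -parallel default of
1 shown here is an assumption for illustration; it is not visible in
this diff):

// Sketch of the partitioning introduced by this patch: the total KV
// cache set by -ctx-size is split evenly across parallel sequence
// slots, giving each request a hard per-sequence context limit.
package main

import (
	"flag"
	"fmt"
)

func main() {
	kvSize := flag.Int("ctx-size", 2048, "Context (or KV cache) size")
	// Assumption: runner.go defines a -parallel flag; its default
	// value here is illustrative, not taken from this diff.
	parallel := flag.Int("parallel", 1, "Number of parallel sequences")
	flag.Parse()

	// Hard partition: integer division, so any remainder cells go
	// unused in exchange for a predictable per-request limit.
	numCtx := *kvSize / *parallel
	fmt.Printf("total KV cache: %d tokens, per-sequence context: %d tokens\n",
		*kvSize, numCtx)
}

For example, with -ctx-size 2048 and -parallel 4, each sequence gets a
512-token window, whereas before this patch all four requests contended
for the same 2048 cache cells.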