From 55fb0633db706249b0e5326d89623b605b3e9f4b Mon Sep 17 00:00:00 2001
From: Jesse Gross
Date: Fri, 23 Aug 2024 17:27:09 -0700
Subject: [PATCH] runner.go: Separate KV cache and context sizes

Currently the entire KV cache is shared by all parallel requestors.
This gives maximum resource utilization but there is a potential for
overflow and unfairness if multiple requests are trying to use
significant context. Instead, it is better to have a hard partition of
KV cache space.
---
 llama/runner/runner.go | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llama/runner/runner.go b/llama/runner/runner.go
index 52087276..77d7bdee 100644
--- a/llama/runner/runner.go
+++ b/llama/runner/runner.go
@@ -594,7 +594,7 @@ func main() {
 	nGpuLayers := flag.Int("n-gpu-layers", 0, "Number of layers to offload to GPU")
 	mainGpu := flag.Int("main-gpu", 0, "Main GPU")
 	flashAttention := flag.Bool("flash-attn", false, "Enable flash attention")
-	numCtx := flag.Int("ctx-size", 2048, "Context (or KV cache) size")
+	kvSize := flag.Int("ctx-size", 2048, "Context (or KV cache) size")
 	lpath := flag.String("lora", "", "Path to lora layer file")
 	port := flag.Int("port", 8080, "Port to expose the server on")
 	threads := flag.Int("threads", runtime.NumCPU(), "Number of threads to use during generation")
@@ -647,7 +647,7 @@ func main() {
 	}
 
 	server := &Server{
-		numCtx:    *numCtx,
+		numCtx:    *kvSize / *parallel,
 		batchSize: *batchSize,
 		parallel:  *parallel,
 		seqs:      make([]*Sequence, *parallel),
@@ -669,7 +669,7 @@ func main() {
 		}
 	}
 
-	ctxParams := llama.NewContextParams(*numCtx, *threads, *flashAttention)
+	ctxParams := llama.NewContextParams(*kvSize, *threads, *flashAttention)
 	server.lc = llama.NewContextWithModel(server.model, ctxParams)
 
 	if server.model.ShouldAddBOSToken() {
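
The effect of the change is that -ctx-size now specifies the total KV
cache, while each of the parallel sequence slots receives a fixed,
equal share of it. A minimal standalone sketch of that arithmetic
follows (the flag names mirror runner.go, but the -parallel default of
1 shown here is an assumption for illustration; it is not visible in
this diff):

// Sketch of the partitioning introduced by this patch: the total KV
// cache set by -ctx-size is split evenly across parallel sequence
// slots, giving each request a hard per-sequence context limit.
package main

import (
	"flag"
	"fmt"
)

func main() {
	kvSize := flag.Int("ctx-size", 2048, "Context (or KV cache) size")
	// Assumption: runner.go defines a -parallel flag; its default
	// value here is illustrative, not taken from this diff.
	parallel := flag.Int("parallel", 1, "Number of parallel sequences")
	flag.Parse()

	// Hard partition: integer division, so any remainder cells go
	// unused in exchange for a predictable per-request limit.
	numCtx := *kvSize / *parallel
	fmt.Printf("total KV cache: %d tokens, per-sequence context: %d tokens\n",
		*kvSize, numCtx)
}

For example, with -ctx-size 2048 and -parallel 4, each sequence gets a
512-token window, whereas before this patch all four requests contended
for the same 2048 cache cells.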