From 8e1554c91dbf681489230a5ff1d29e16db126ac9 Mon Sep 17 00:00:00 2001
From: Jesse Gross
Date: Fri, 23 Aug 2024 13:44:30 -0700
Subject: [PATCH] runner.go: Scale batches to be processed by numParallel

We should process a batch of tokens for each parallel request, rather
than having a shared pool. Otherwise, a single request can fill the
batch and subsequent ones will fail or get starved.

Server.cpp used the KV cache size allocated for each parallel request
as the allocated size for the batch. This is the upper bound for the
batch, but since we know how many tokens we will actually put in a
batch, there is no need to over-allocate.
---
 llama/runner/runner.go | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/llama/runner/runner.go b/llama/runner/runner.go
index 07fa5f06..29d59432 100644
--- a/llama/runner/runner.go
+++ b/llama/runner/runner.go
@@ -198,8 +198,7 @@ func incompleteUnicode(token string) bool {
 }
 
 func (s *Server) run(ctx context.Context) {
-	// TODO - should this be n_ctx / parallel like the old server.cpp setup?
-	batch := llama.NewBatch(s.batchSize, 0, s.parallel)
+	batch := llama.NewBatch(s.batchSize*len(s.seqs), 0, len(s.seqs))
 	defer batch.Free()
 
 	// build up stop sequences as we recognize them
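
For illustration, below is a minimal, self-contained sketch of the sizing arithmetic the patch moves to. The variable names (`batchSize`, `numParallel`, `kvCachePerSeq`) are hypothetical stand-ins for the server's fields, and the values are made up; this is not the runner's actual code, only the allocation math described in the commit message.

```go
package main

import "fmt"

func main() {
	// Hypothetical example values.
	batchSize := 512      // tokens to process per sequence per step (s.batchSize)
	numParallel := 4      // number of parallel sequences (len(s.seqs))
	kvCachePerSeq := 8192 // KV cache slots per parallel request (n_ctx / parallel)

	// Removed behavior: one shared pool of batchSize tokens. A single
	// long request can fill it and starve the other sequences.
	sharedPool := batchSize

	// Old server.cpp sizing: use the per-request KV cache allocation as
	// the batch capacity. This is only an upper bound, so it over-allocates.
	serverCppCapacity := kvCachePerSeq

	// New sizing: one batch-size worth of tokens for each parallel
	// sequence, matching what will actually be placed in the batch.
	newCapacity := batchSize * numParallel

	fmt.Println("shared pool:", sharedPool)           // 512
	fmt.Println("server.cpp-style:", serverCppCapacity) // 8192
	fmt.Println("scaled by numParallel:", newCapacity)  // 2048
}
```

The scaled capacity gives each sequence room for its own batch-size slice without reserving the full per-request KV cache, which is why the patch passes `s.batchSize*len(s.seqs)` to `llama.NewBatch`.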