diff --git a/llama/runner/runner.go b/llama/runner/runner.go
index 07fa5f06..29d59432 100644
--- a/llama/runner/runner.go
+++ b/llama/runner/runner.go
@@ -198,8 +198,7 @@ func incompleteUnicode(token string) bool {
 }
 
 func (s *Server) run(ctx context.Context) {
-	// TODO - should this be n_ctx / parallel like the old server.cpp setup?
-	batch := llama.NewBatch(s.batchSize, 0, s.parallel)
+	batch := llama.NewBatch(s.batchSize*len(s.seqs), 0, len(s.seqs))
 	defer batch.Free()
 
 	// build up stop sequences as we recognize them