diff --git a/llm/ext_server.go b/llm/ext_server.go index bd026043..ded424a9 100644 --- a/llm/ext_server.go +++ b/llm/ext_server.go @@ -160,7 +160,7 @@ func newExtServer(server extServer, model string, adapters, projectors []string, sparams.n_batch = C.uint(opts.NumBatch) sparams.n_gpu_layers = C.int(numGPU) sparams.main_gpu = C.int(opts.MainGPU) - sparams.n_parallel = 2 // TODO - wire up concurrency + sparams.n_parallel = 1 // TODO - wire up concurrency // Always use the value encoded in the model sparams.rope_freq_base = 0.0 diff --git a/llm/llama.cpp/gguf b/llm/llama.cpp/gguf index a7aee47b..328b83de 160000 --- a/llm/llama.cpp/gguf +++ b/llm/llama.cpp/gguf @@ -1 +1 @@ -Subproject commit a7aee47b98e45539d491071b25778b833b77e387 +Subproject commit 328b83de23b33240e28f4e74900d1d06726f5eb1 diff --git a/llm/llama.cpp/patches/0001-Expose-callable-API-for-server.patch b/llm/llama.cpp/patches/0001-Expose-callable-API-for-server.patch index 623243d4..2e5a981e 100644 --- a/llm/llama.cpp/patches/0001-Expose-callable-API-for-server.patch +++ b/llm/llama.cpp/patches/0001-Expose-callable-API-for-server.patch @@ -1,4 +1,4 @@ -From 087cf3300e973d7790db8f7cad01d2a790de38be Mon Sep 17 00:00:00 2001 +From b5e195803e2a989e57eef0010adce778df1e2d01 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Mon, 13 Nov 2023 12:25:58 -0800 Subject: [PATCH] Expose callable API for server @@ -46,7 +46,7 @@ index 859cd12..4ea47a7 100644 +endif() \ No newline at end of file diff --git a/examples/server/server.cpp b/examples/server/server.cpp -index d0cd8e1..5f5d4c5 100644 +index 0403853..2084fd8 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -5,6 +5,9 @@ @@ -59,15 +59,15 @@ index d0cd8e1..5f5d4c5 100644 #ifndef NDEBUG // crash the server in debug mode, otherwise send an http 500 error -@@ -2632,6 +2635,7 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con +@@ -2643,6 +2646,7 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con } } +#ifndef LLAMA_SERVER_LIBRARY int main(int argc, char **argv) { - // own arguments required by this example -@@ -3066,3 +3070,273 @@ int main(int argc, char **argv) + #if SERVER_VERBOSE != 1 +@@ -3123,3 +3127,273 @@ int main(int argc, char **argv) llama_backend_free(); return 0; } @@ -439,10 +439,10 @@ index 0000000..d22f1b6 +#endif // LLAMA_SERVER_LIBRARY \ No newline at end of file diff --git a/ggml-cuda.cu b/ggml-cuda.cu -index 9e1acd3..ea64b55 100644 +index f20846f..9640cf3 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu -@@ -6505,6 +6505,7 @@ static cudaError_t ggml_cuda_cpy_tensor_2d( +@@ -6757,6 +6757,7 @@ static cudaError_t ggml_cuda_cpy_tensor_2d( CUDA_CHECK(cudaGetDevice(&id)); src_ptr = (char *) extra->data_device[id]; } else {