Fix CPU performance on hyperthreaded systems

The default thread count logic was broken: on a hyperthreaded CPU it spawned
twice as many threads as it should, resulting in thrashing and poor
performance.
Daniel Hiltgen 2023-12-21 16:23:36 -08:00
parent d9cd3d9667
commit 325d74985b
2 changed files with 9 additions and 12 deletions
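
For context: runtime.NumCPU — the value the removed fallback used — reports logical CPUs, so with 2-way SMT/hyperthreading it is double the physical core count that compute-bound llama threads should target. A minimal illustrative Go sketch (not part of this commit; the halve-by-two estimate is an assumption for illustration only):

    package main

    import (
        "fmt"
        "runtime"
    )

    func main() {
        // runtime.NumCPU counts logical CPUs (hyperthreads included), so on a
        // 2-way SMT machine it reports twice the physical core count. Using it
        // directly as the default llama thread count oversubscribes the cores.
        logical := runtime.NumCPU()
        fmt.Printf("logical CPUs reported: %d\n", logical)
        fmt.Printf("rough physical-core estimate (assuming 2-way SMT): %d\n", logical/2)
    }

With the fallback removed, a zero opts.NumThread is now passed through to the C++ side unchanged, where the guard added in the patch below leaves the server's own default in place.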

View File

@@ -37,7 +37,6 @@ import (
     "fmt"
     "log"
     "os"
-    "runtime"
     "strings"
     "sync"
     "time"
@@ -185,11 +184,7 @@ func newExtServer(server extServer, model string, adapters, projectors []string,
         sparams.mmproj = nil
     }
-    if opts.NumThread > 0 {
-        sparams.n_threads = C.uint(opts.NumThread)
-    } else {
-        sparams.n_threads = C.uint(runtime.NumCPU())
-    }
+    sparams.n_threads = C.uint(opts.NumThread)
     log.Printf("Initializing internal llama server")
     resp := newExtServerResp(128)

View File

@@ -1,4 +1,4 @@
-From b5e195803e2a989e57eef0010adce778df1e2d01 Mon Sep 17 00:00:00 2001
+From 7184ae16e8fd0e9e91cac4c81daa323057fa992b Mon Sep 17 00:00:00 2001
 From: Daniel Hiltgen <daniel@ollama.com>
 Date: Mon, 13 Nov 2023 12:25:58 -0800
 Subject: [PATCH] Expose callable API for server
@@ -6,10 +6,10 @@ Subject: [PATCH] Expose callable API for server
 This adds an extern "C" interface within the example server
 ---
  examples/server/CMakeLists.txt |  24 +++
- examples/server/server.cpp     | 274 +++++++++++++++++++++++++++++++++
+ examples/server/server.cpp     | 276 +++++++++++++++++++++++++++++++++
  examples/server/server.h       |  89 +++++++++++
  ggml-cuda.cu                   |   1 +
- 4 files changed, 388 insertions(+)
+ 4 files changed, 390 insertions(+)
  create mode 100644 examples/server/server.h

 diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt
@@ -46,7 +46,7 @@ index 859cd12..4ea47a7 100644
 +endif()
 \ No newline at end of file
 diff --git a/examples/server/server.cpp b/examples/server/server.cpp
-index 0403853..2084fd8 100644
+index 0403853..065420c 100644
 --- a/examples/server/server.cpp
 +++ b/examples/server/server.cpp
 @@ -5,6 +5,9 @@
@@ -67,7 +67,7 @@ index 0403853..2084fd8 100644
 int main(int argc, char **argv)
 {
 #if SERVER_VERBOSE != 1
-@@ -3123,3 +3127,273 @@ int main(int argc, char **argv)
+@@ -3123,3 +3127,275 @@ int main(int argc, char **argv)
     llama_backend_free();
     return 0;
 }
@@ -89,7 +89,9 @@ index 0403853..2084fd8 100644
 +    gpt_params params;
 +    params.n_ctx = sparams->n_ctx;
 +    params.n_batch = sparams->n_batch;
-+    params.n_threads = sparams->n_threads;
++    if (sparams->n_threads > 0) {
++        params.n_threads = sparams->n_threads;
++    }
 +    params.n_parallel = sparams->n_parallel;
 +    params.rope_freq_base = sparams->rope_freq_base;
 +    params.rope_freq_scale = sparams->rope_freq_scale;
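
The guard in the last hunk means a zero n_threads from the Go side no longer clobbers whatever default gpt_params already carries (in llama.cpp of this vintage that default appears to be based on physical cores). A hedged Go sketch of the resulting selection logic — the function and parameter names are illustrative, not from the commit; the real decision happens on the C++ side:

    package main

    import "fmt"

    // pickThreads mirrors the guard added in the patch above: a positive
    // requested value overrides the backend default, zero leaves it alone.
    func pickThreads(requested, backendDefault int) int {
        if requested > 0 {
            return requested
        }
        return backendDefault
    }

    func main() {
        fmt.Println(pickThreads(0, 8))  // 0 requested -> keep backend default (8)
        fmt.Println(pickThreads(12, 8)) // explicit request wins (12)
    }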