From df3802a65fa5876708fa03e0369248bd0ea4b2c4 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Thu, 1 Aug 2024 17:22:25 -0700 Subject: [PATCH 01/71] Adjust ARM CUDA repo paths Ubuntu distros fail to install CUDA drivers on ARM because aarch64 is not a valid architecture directory in NVIDIA's repo layout; the ARM server repos use sbsa instead. --- scripts/install.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/scripts/install.sh b/scripts/install.sh index aa8b3e5e..03af5a69 100644 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -209,15 +209,15 @@ install_cuda_driver_yum() { case $PACKAGE_MANAGER in yum) $SUDO $PACKAGE_MANAGER -y install yum-utils - if curl -I --silent --fail --location "https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-$1$2.repo" >/dev/null ; then - $SUDO $PACKAGE_MANAGER-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-$1$2.repo + if curl -I --silent --fail --location "https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m | sed -e 's/aarch64/sbsa/')/cuda-$1$2.repo" >/dev/null ; then + $SUDO $PACKAGE_MANAGER-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m | sed -e 's/aarch64/sbsa/')/cuda-$1$2.repo else error $CUDA_REPO_ERR_MSG fi ;; dnf) - if curl -I --silent --fail --location "https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-$1$2.repo" >/dev/null ; then - $SUDO $PACKAGE_MANAGER config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-$1$2.repo + if curl -I --silent --fail --location "https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m | sed -e 's/aarch64/sbsa/')/cuda-$1$2.repo" >/dev/null ; then + $SUDO $PACKAGE_MANAGER config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m | sed -e 's/aarch64/sbsa/')/cuda-$1$2.repo else error $CUDA_REPO_ERR_MSG fi @@ -245,8 +245,8 @@ install_cuda_driver_yum() { # ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#debian install_cuda_driver_apt() { status 'Installing NVIDIA repository...'
- if curl -I --silent --fail --location "https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-keyring_1.1-1_all.deb" >/dev/null ; then - curl -fsSL -o $TEMP_DIR/cuda-keyring.deb https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-keyring_1.1-1_all.deb + if curl -I --silent --fail --location "https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m | sed -e 's/aarch64/sbsa/')/cuda-keyring_1.1-1_all.deb" >/dev/null ; then + curl -fsSL -o $TEMP_DIR/cuda-keyring.deb https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m | sed -e 's/aarch64/sbsa/')/cuda-keyring_1.1-1_all.deb else error $CUDA_REPO_ERR_MSG fi From a091fadfdaa2e4d6a34cf8bbfe4012913367a35a Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Fri, 2 Aug 2024 15:55:34 -0700 Subject: [PATCH 02/71] use testing tempdirs --- server/routes_create_test.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/server/routes_create_test.go b/server/routes_create_test.go index 9fd7f8cd..4de07b25 100644 --- a/server/routes_create_test.go +++ b/server/routes_create_test.go @@ -2,6 +2,7 @@ package server import ( "bytes" + "cmp" "encoding/json" "fmt" "io" @@ -53,6 +54,8 @@ func (t *responseRecorder) CloseNotify() <-chan bool { func createRequest(t *testing.T, fn func(*gin.Context), body any) *httptest.ResponseRecorder { t.Helper() + // if OLLAMA_MODELS is not set, set it to the temp directory + t.Setenv("OLLAMA_MODELS", cmp.Or(os.Getenv("OLLAMA_MODELS"), t.TempDir())) w := NewRecorder() c, _ := gin.CreateTestContext(w) From ed6c8bfe57e4678090b89fc8f6c4e08ce1b01040 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Mon, 5 Aug 2024 00:02:47 -0700 Subject: [PATCH 03/71] removeall to remove non-empty temp dirs --- gpu/assets.go | 39 +++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/gpu/assets.go b/gpu/assets.go index a35b6630..1c33b55b 100644 --- a/gpu/assets.go +++ b/gpu/assets.go @@ -67,37 +67,44 @@ func PayloadsDir() (string, error) { // Best effort to clean up prior tmpdirs func cleanupTmpDirs() { - dirs, err := filepath.Glob(filepath.Join(os.TempDir(), "ollama*")) + matches, err := filepath.Glob(filepath.Join(os.TempDir(), "ollama*", "ollama.pid")) if err != nil { return } - for _, d := range dirs { - info, err := os.Stat(d) - if err != nil || !info.IsDir() { + + for _, match := range matches { + raw, err := os.ReadFile(match) + if errors.Is(err, os.ErrNotExist) { + slog.Debug("not a ollama runtime directory, skipping", "path", match) continue - } - raw, err := os.ReadFile(filepath.Join(d, "ollama.pid")) - if err != nil { - slog.Warn("failed to read ollama.pid", "path", d, "error", err) - // No pid, ignore this tmpdir + } else if err != nil { + slog.Warn("could not read ollama.pid, skipping", "path", match, "error", err) continue } pid, err := strconv.Atoi(string(raw)) if err != nil { - slog.Warn("failed to parse pid", "path", d, "error", err) + slog.Warn("invalid pid, skipping", "path", match, "error", err) continue } - proc, err := os.FindProcess(pid) - if err == nil && !errors.Is(proc.Signal(syscall.Signal(0)), os.ErrProcessDone) { - slog.Warn("found running ollama", "pid", pid, "path", d) - // Another running ollama, ignore this tmpdir + p, err := os.FindProcess(pid) + if err == nil && !errors.Is(p.Signal(syscall.Signal(0)), os.ErrProcessDone) { + slog.Warn("process still running, skipping", "pid", pid, "path", match) continue } - if err := os.Remove(d); err != nil { - slog.Warn("unable to cleanup stale 
tmpdir", "path", d, "error", err) + if err := os.Remove(match); err != nil { + slog.Warn("could not cleanup stale pidfile", "path", match, "error", err) + } + + runners := filepath.Join(filepath.Dir(match), "runners") + if err := os.RemoveAll(runners); err != nil { + slog.Warn("could not cleanup stale runners", "path", runners, "error", err) + } + + if err := os.Remove(filepath.Dir(match)); err != nil { + slog.Warn("could not cleanup stale tmpdir", "path", filepath.Dir(match), "error", err) } } } From 43f9d92008bf1aaa2e89ca50c85761540f70c21a Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Mon, 5 Aug 2024 00:34:09 -0700 Subject: [PATCH 04/71] close pid file --- gpu/assets.go | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/gpu/assets.go b/gpu/assets.go index 1c33b55b..6d62d0dc 100644 --- a/gpu/assets.go +++ b/gpu/assets.go @@ -49,13 +49,9 @@ func PayloadsDir() (string, error) { } // Track our pid so we can clean up orphaned tmpdirs - pidFilePath := filepath.Join(tmpDir, "ollama.pid") - pidFile, err := os.OpenFile(pidFilePath, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, os.ModePerm) - if err != nil { - return "", err - } - if _, err := pidFile.Write([]byte(strconv.Itoa(os.Getpid()))); err != nil { - return "", err + n := filepath.Join(tmpDir, "ollama.pid") + if err := os.WriteFile(n, []byte(strconv.Itoa(os.Getpid())), 0o644); err != nil { + return "", fmt.Errorf("failed to write pid file %s: %w", n, err) } // We create a distinct subdirectory for payloads within the tmpdir From 04210aa6ddf9ec5d5b6101f6e8a12b68d7aadfee Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Mon, 5 Aug 2024 09:28:07 -0700 Subject: [PATCH 05/71] Catch one more error log --- llm/status.go | 1 + 1 file changed, 1 insertion(+) diff --git a/llm/status.go b/llm/status.go index d9f36115..604fe9e0 100644 --- a/llm/status.go +++ b/llm/status.go @@ -26,6 +26,7 @@ var errorPrefixes = []string{ "cudaMalloc failed", "\"ERR\"", "error loading model", + "GGML_ASSERT", } func (w *StatusWriter) Write(b []byte) (int, error) { From f457d63400f9859acdfff1853c53af13429acea5 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Mon, 5 Aug 2024 12:56:20 -0700 Subject: [PATCH 06/71] Implement linux NUMA detection If the system has multiple numa nodes, enable numa support in llama.cpp If we detect numactl in the path, use that, else use the basic "distribute" mode. 
--- api/types.go | 2 -- gpu/cpu_common.go | 21 +++++++++++++++++++++ llm/server.go | 10 ++++++++-- 3 files changed, 29 insertions(+), 4 deletions(-) diff --git a/api/types.go b/api/types.go index c2529652..291522a3 100644 --- a/api/types.go +++ b/api/types.go @@ -231,7 +231,6 @@ type Options struct { // Runner options which must be set when the model is loaded into memory type Runner struct { - UseNUMA bool `json:"numa,omitempty"` NumCtx int `json:"num_ctx,omitempty"` NumBatch int `json:"num_batch,omitempty"` NumGPU int `json:"num_gpu,omitempty"` @@ -615,7 +614,6 @@ func DefaultOptions() Options { F16KV: true, UseMLock: false, UseMMap: nil, - UseNUMA: false, }, } } diff --git a/gpu/cpu_common.go b/gpu/cpu_common.go index 63e88f25..34edcdc5 100644 --- a/gpu/cpu_common.go +++ b/gpu/cpu_common.go @@ -1,6 +1,11 @@ package gpu import ( + "os" + "path/filepath" + "runtime" + "strings" + "golang.org/x/sys/cpu" ) @@ -14,3 +19,19 @@ func GetCPUCapability() CPUCapability { // else LCD return CPUCapabilityNone } + +func IsNUMA() bool { + if runtime.GOOS != "linux" { + // numa support in llama.cpp is linux only + return false + } + ids := map[string]interface{}{} + packageIds, _ := filepath.Glob("/sys/devices/system/cpu/cpu*/topology/physical_package_id") + for _, packageId := range packageIds { + id, err := os.ReadFile(packageId) + if err == nil { + ids[strings.TrimSpace(string(id))] = struct{}{} + } + } + return len(ids) > 1 +} diff --git a/llm/server.go b/llm/server.go index 7abc3bd7..152b7582 100644 --- a/llm/server.go +++ b/llm/server.go @@ -256,8 +256,14 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr params = append(params, "--mlock") } - if opts.UseNUMA { - params = append(params, "--numa") + if gpu.IsNUMA() { + numaMode := "distribute" + if runtime.GOOS == "linux" { + if _, err := exec.LookPath("numactl"); err == nil { + numaMode = "numactl" + } + } + params = append(params, "--numa", numaMode) } params = append(params, "--parallel", strconv.Itoa(numParallel)) From 7ed367419e8fee28c393f1f80edfb5686fddaed6 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Mon, 5 Aug 2024 16:34:54 -0700 Subject: [PATCH 07/71] fix concurrency test --- integration/concurrency_test.go | 19 +++++++++---------- integration/llm_test.go | 4 ++-- integration/max_queue_test.go | 2 +- integration/utils_test.go | 10 +++++----- 4 files changed, 17 insertions(+), 18 deletions(-) diff --git a/integration/concurrency_test.go b/integration/concurrency_test.go index 81d0b587..42e9d074 100644 --- a/integration/concurrency_test.go +++ b/integration/concurrency_test.go @@ -5,6 +5,7 @@ package integration import ( "context" "log/slog" + "os" "strconv" "sync" "testing" @@ -13,7 +14,6 @@ import ( "github.com/stretchr/testify/require" "github.com/ollama/ollama/api" - "github.com/ollama/ollama/envconfig" "github.com/ollama/ollama/format" ) @@ -41,8 +41,8 @@ func TestMultiModelConcurrency(t *testing.T) { }, } resp = [2][]string{ - []string{"sunlight"}, - []string{"england", "english", "massachusetts", "pilgrims", "british"}, + {"sunlight"}, + {"england", "english", "massachusetts", "pilgrims", "british"}, } ) var wg sync.WaitGroup @@ -71,12 +71,11 @@ func TestIntegrationConcurrentPredictOrcaMini(t *testing.T) { reqLimit := len(req) iterLimit := 5 - vram := os.Getenv("OLLAMA_MAX_VRAM") // TODO - discover actual VRAM - if vram != "" { - max, err := strconv.ParseUint(vram, 10, 64) + if s := os.Getenv("OLLAMA_MAX_VRAM"); s != "" { + maxVram, err := strconv.ParseUint(s, 10, 64) require.NoError(t, err) // 
Don't hammer on small VRAM cards... - if max < 4*1024*1024*1024 { + if maxVram < 4*format.GibiByte { reqLimit = min(reqLimit, 2) iterLimit = 2 } @@ -233,12 +232,12 @@ func TestMultiModelStress(t *testing.T) { consumed := uint64(256 * format.MebiByte) // Assume some baseline usage for i := 0; i < len(req); i++ { // Always get at least 2 models, but dont' overshoot VRAM too much or we'll take too long - if i > 1 && consumed > vram { - slog.Info("achieved target vram exhaustion", "count", i, "vram", format.HumanBytes2(vram), "models", format.HumanBytes2(consumed)) + if i > 1 && consumed > maxVram { + slog.Info("achieved target vram exhaustion", "count", i, "vram", format.HumanBytes2(maxVram), "models", format.HumanBytes2(consumed)) break } consumed += chosenModels[i].size - slog.Info("target vram", "count", i, "vram", format.HumanBytes2(vram), "models", format.HumanBytes2(consumed)) + slog.Info("target vram", "count", i, "vram", format.HumanBytes2(maxVram), "models", format.HumanBytes2(consumed)) wg.Add(1) go func(i int) { diff --git a/integration/llm_test.go b/integration/llm_test.go index 4952b072..398e0a03 100644 --- a/integration/llm_test.go +++ b/integration/llm_test.go @@ -35,8 +35,8 @@ var ( }, } resp = [2][]string{ - []string{"sunlight"}, - []string{"england", "english", "massachusetts", "pilgrims"}, + {"sunlight"}, + {"england", "english", "massachusetts", "pilgrims"}, } ) diff --git a/integration/max_queue_test.go b/integration/max_queue_test.go index b06197e1..ec9e085a 100644 --- a/integration/max_queue_test.go +++ b/integration/max_queue_test.go @@ -29,7 +29,7 @@ func TestMaxQueue(t *testing.T) { // Also note that by default Darwin can't sustain > ~128 connections without adjusting limits threadCount := 32 if maxQueue := envconfig.MaxQueue(); maxQueue != 0 { - threadCount = maxQueue + threadCount = int(maxQueue) } else { t.Setenv("OLLAMA_MAX_QUEUE", strconv.Itoa(threadCount)) } diff --git a/integration/utils_test.go b/integration/utils_test.go index c2b27ee9..a6010995 100644 --- a/integration/utils_test.go +++ b/integration/utils_test.go @@ -334,10 +334,10 @@ func GenerateRequests() ([]api.GenerateRequest, [][]string) { }, }, [][]string{ - []string{"sunlight"}, - []string{"soil", "organic", "earth", "black", "tan"}, - []string{"england", "english", "massachusetts", "pilgrims", "british"}, - []string{"fourth", "july", "declaration", "independence"}, - []string{"nitrogen", "oxygen", "carbon", "dioxide"}, + {"sunlight"}, + {"soil", "organic", "earth", "black", "tan"}, + {"england", "english", "massachusetts", "pilgrims", "british"}, + {"fourth", "july", "declaration", "independence"}, + {"nitrogen", "oxygen", "carbon", "dioxide"}, } } From 86b907f82ad1cc5eb16e919d6cb5830765d73be4 Mon Sep 17 00:00:00 2001 From: royjhan <65097070+royjhan@users.noreply.github.com> Date: Mon, 5 Aug 2024 19:55:34 -0400 Subject: [PATCH 08/71] sort batch results (#6189) --- llm/ext_server/server.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/llm/ext_server/server.cpp b/llm/ext_server/server.cpp index d72bb1b1..071fe1e7 100644 --- a/llm/ext_server/server.cpp +++ b/llm/ext_server/server.cpp @@ -44,6 +44,7 @@ #include #endif +#include #include #include #include @@ -1220,6 +1221,7 @@ struct llama_server_context res.result_json = json { + {"id", res.id}, {"embedding", std::vector(embd, embd + n_embd)}, {"timings", slot.get_formated_timings()}, }; @@ -3203,6 +3205,10 @@ int main(int argc, char **argv) { } responses = result.result_json.value("results", std::vector{result.result_json}); + 
std::sort(responses.begin(), responses.end(), [](const json& a, const json& b) { + return a["id"] < b["id"]; + }); + json embeddings = json::array(); int prompt_n = 0; From fc85f50a2be9ba8776547de9db02c5373719eb13 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Tue, 6 Aug 2024 10:46:31 -0700 Subject: [PATCH 09/71] Ensure sparse files on Windows during download The file.Truncate call on Windows will write the whole file unless you set the sparse flag, leading to heavy I/O at the beginning of the download. This should improve our I/O behavior on Windows and put less stress on the user's disk. --- server/download.go | 3 +++ server/sparse_common.go | 9 +++++++++ server/sparse_windows.go | 16 ++++++++++++++++ 3 files changed, 28 insertions(+) create mode 100644 server/sparse_common.go create mode 100644 server/sparse_windows.go diff --git a/server/download.go b/server/download.go index a903d96f..38d24a6b 100644 --- a/server/download.go +++ b/server/download.go @@ -216,6 +216,9 @@ func (b *blobDownload) run(ctx context.Context, requestURL *url.URL, opts *regis return err } defer file.Close() + if err := setSparse(file); err != nil { + return err + } _ = file.Truncate(b.Total) diff --git a/server/sparse_common.go b/server/sparse_common.go new file mode 100644 index 00000000..f25627fc --- /dev/null +++ b/server/sparse_common.go @@ -0,0 +1,9 @@ +//go:build !windows + +package server + +import "os" + +func setSparse(file *os.File) error { + return nil +} diff --git a/server/sparse_windows.go b/server/sparse_windows.go new file mode 100644 index 00000000..cdad379e --- /dev/null +++ b/server/sparse_windows.go @@ -0,0 +1,16 @@ +package server + +import ( + "os" + + "golang.org/x/sys/windows" +) + +func setSparse(file *os.File) error { + return windows.DeviceIoControl( + windows.Handle(file.Fd()), windows.FSCTL_SET_SPARSE, + nil, 0, + nil, 0, + nil, nil, + ) +} From d4a7216c82bb406e644c739281ade3f7f2e283e5 Mon Sep 17 00:00:00 2001 From: Chua Chee Seng Date: Wed, 7 Aug 2024 02:37:16 +0800 Subject: [PATCH 10/71] Fix invalid option warnings not displaying the offending option name
(#6202) --- api/types.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/types.go b/api/types.go index 291522a3..2f5a9424 100644 --- a/api/types.go +++ b/api/types.go @@ -504,7 +504,7 @@ func (opts *Options) FromMap(m map[string]interface{}) error { for key, val := range m { opt, ok := jsonOpts[key] if !ok { - slog.Warn("invalid option provided", "option", opt.Name) + slog.Warn("invalid option provided", "option", key) continue } From e04c7012c235d8972afe5538ff27802c77217b83 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Tue, 6 Aug 2024 15:11:45 -0400 Subject: [PATCH 11/71] update llama.cpp submodule to `1e6f6554` (#6208) --- llm/ext_server/server.cpp | 14 +++++++++++--- llm/llama.cpp | 2 +- llm/patches/09-lora.diff | 34 +++++++++++++--------------------- llm/patches/10-params.diff | 20 -------------------- 4 files changed, 25 insertions(+), 45 deletions(-) delete mode 100644 llm/patches/10-params.diff diff --git a/llm/ext_server/server.cpp b/llm/ext_server/server.cpp index 071fe1e7..c65901c7 100644 --- a/llm/ext_server/server.cpp +++ b/llm/ext_server/server.cpp @@ -403,7 +403,9 @@ struct llama_server_context } } - std::tie(model, ctx) = llama_init_from_gpt_params(params); + auto init_result = llama_init_from_gpt_params(params); + model = init_result.model; + ctx = init_result.context; if (model == nullptr) { LOG_ERROR("unable to load model", {{"model", params.model}}); @@ -2422,7 +2424,10 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g invalid_param = true; break; } - params.lora_adapter.emplace_back(argv[i], 1.0f); + params.lora_adapters.push_back({ + std::string(argv[i]), + 1.0, + }); params.use_mmap = false; } else if (arg == "--lora-scaled") @@ -2438,7 +2443,10 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g invalid_param = true; break; } - params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i])); + params.lora_adapters.push_back({ + lora_adapter, + std::stof(argv[i]) + }); params.use_mmap = false; } else if (arg == "-v" || arg == "--verbose") diff --git a/llm/llama.cpp b/llm/llama.cpp index 6eeaeba1..1e6f6554 160000 --- a/llm/llama.cpp +++ b/llm/llama.cpp @@ -1 +1 @@ -Subproject commit 6eeaeba126ff701f3e8f79f246805b7023709972 +Subproject commit 1e6f6554aa11fa10160a5fda689e736c3c34169f diff --git a/llm/patches/09-lora.diff b/llm/patches/09-lora.diff index 10c66d1d..21958476 100644 --- a/llm/patches/09-lora.diff +++ b/llm/patches/09-lora.diff @@ -1,40 +1,32 @@ diff --git a/common/common.cpp b/common/common.cpp -index dbb724fb..c26fe6ee 100644 +index 2e8374d5..70d0afde 100644 --- a/common/common.cpp +++ b/common/common.cpp -@@ -2087,14 +2087,27 @@ std::tuple llama_init_from_gpt_par - for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) { - const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]); - float lora_scale = std::get<1>(params.lora_adapter[i]); -+ -+ // try to load as gguf - auto adapter = llama_lora_adapter_init(model, lora_adapter.c_str()); - if (adapter == nullptr) { -- fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__); +@@ -2110,9 +2110,21 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) { + loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str()); + if (loaded_la.adapter == nullptr) { + fprintf(stderr, "%s: error: failed to apply lora adapter '%s'\n", __func__, la.path.c_str()); - llama_free(lctx); - llama_free_model(model); -- return std::make_tuple(nullptr, nullptr); -+ 
fprintf(stderr, "%s: error: failed to apply lora adapter, trying ggla\n", __func__); +- return iparams; + + // if that fails, try loading as ggla for compatibility + int err = llama_model_apply_lora_from_file(model, -+ lora_adapter.c_str(), -+ lora_scale, ++ la.path.c_str(), ++ la.scale, + nullptr, + params.n_threads); + if (err != 0) { + fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__); + llama_free(lctx); + llama_free_model(model); -+ return std::make_tuple(nullptr, nullptr); ++ return iparams; ++ } else { ++ break; + } -+ } else { -+ llama_lora_adapter_set(lctx, adapter, lora_scale); } -- llama_lora_adapter_set(lctx, adapter, lora_scale); + iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters } - - if (params.ignore_eos) { diff --git a/include/llama.h b/include/llama.h index 93fd77ca..b0fb37a6 100644 --- a/include/llama.h @@ -355,4 +347,4 @@ index 80a0dd0f..9d7b0e17 100644 + return 1; + } +} -\ No newline at end of file +\ No newline at end of file \ No newline at end of file diff --git a/llm/patches/10-params.diff b/llm/patches/10-params.diff deleted file mode 100644 index 56699b8e..00000000 --- a/llm/patches/10-params.diff +++ /dev/null @@ -1,20 +0,0 @@ -diff --git a/src/llama.cpp b/src/llama.cpp -index a207451f..fba6b175 100644 ---- a/src/llama.cpp -+++ b/src/llama.cpp -@@ -4969,6 +4969,7 @@ static void llm_load_hparams( - hparams.attn_soft_cap = true; - - switch (hparams.n_layer) { -+ case 26: model.type = e_model::MODEL_2B; break; - case 42: model.type = e_model::MODEL_9B; break; - case 46: model.type = e_model::MODEL_27B; break; - default: model.type = e_model::MODEL_UNKNOWN; -@@ -11736,6 +11737,7 @@ struct llm_build_context { - - // ref: https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e - switch (model.type) { -+ case e_model::MODEL_2B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); break; - case e_model::MODEL_9B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); break; - case e_model::MODEL_27B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd / n_head))); break; - default: GGML_ABORT("fatal error"); From de4fc297732cb60ff79a6c8010a7c79971c21b4a Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Tue, 6 Aug 2024 23:20:49 -0400 Subject: [PATCH 12/71] llm: reserve required number of slots for embeddings (#6219) --- llm/server.go | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/llm/server.go b/llm/server.go index 152b7582..41736068 100644 --- a/llm/server.go +++ b/llm/server.go @@ -44,11 +44,12 @@ type LlamaServer interface { // llmServer is an instance of the llama.cpp server type llmServer struct { - port int - cmd *exec.Cmd - done chan error // Channel to signal when the process exits - status *StatusWriter - options api.Options + port int + cmd *exec.Cmd + done chan error // Channel to signal when the process exits + status *StatusWriter + options api.Options + numParallel int estimate MemoryEstimate totalLayers uint64 @@ -343,6 +344,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr status: NewStatusWriter(os.Stderr), options: opts, estimate: estimate, + numParallel: numParallel, sem: semaphore.NewWeighted(int64(numParallel)), totalLayers: ggml.KV().BlockCount() + 1, gpus: gpus, @@ -890,11 +892,14 @@ type EmbedResponse struct { } func (s *llmServer) Embed(ctx context.Context, input []string) (*EmbedResponse, error) { - if err := s.sem.Acquire(ctx, 1); err != nil { + // each 
input will use a slot, so we need to acquire the semaphore for + the number of inputs up to numParallel + slots := int64(min(len(input), s.numParallel)) + if err := s.sem.Acquire(ctx, slots); err != nil { slog.Error("Failed to acquire semaphore", "error", err) return nil, err } - defer s.sem.Release(1) + defer s.sem.Release(slots) // Make sure the server is ready status, err := s.getServerStatusRetry(ctx) From 685a53534b80a14efdfdb09ca00af984782ba6ee Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Thu, 1 Aug 2024 15:05:16 -0700 Subject: [PATCH 13/71] manifest: Don't prune layers if we can't open a manifest file If there is an error when opening a manifest file (corrupted, permission denied, etc.), then the referenced layers will not be included in the list of active layers. This causes them to be deleted when pruning happens at startup or a model is pulled. In such a situation, we should prefer to preserve data in the hopes that it can be recovered rather than being aggressive about deletion. --- server/images.go | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/server/images.go b/server/images.go index 81357f3c..05875a88 100644 --- a/server/images.go +++ b/server/images.go @@ -714,8 +714,7 @@ func deleteUnusedLayers(skipModelPath *ModelPath, deleteMap map[string]struct{}) // save (i.e. delete from the deleteMap) any files used in other manifests manifest, _, err := GetManifest(fmp) if err != nil { - //nolint:nilerr - return nil + return err } for _, layer := range manifest.Layers { @@ -782,7 +781,8 @@ func PruneLayers() error { err = deleteUnusedLayers(nil, deleteMap) if err != nil { - return err + slog.Info(fmt.Sprintf("couldn't remove unused layers: %v", err)) + return nil } slog.Info(fmt.Sprintf("total unused blobs removed: %d", len(deleteMap))) @@ -971,7 +971,8 @@ func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn fu fn(api.ProgressResponse{Status: "removing any unused layers"}) err = deleteUnusedLayers(nil, deleteMap) if err != nil { - return err + slog.Info(fmt.Sprintf("couldn't remove unused layers: %v", err)) + fn(api.ProgressResponse{Status: fmt.Sprintf("couldn't remove unused layers: %v", err)}) } } From ce67706037a2583157fcac4cbf6253fe0f1e5139 Mon Sep 17 00:00:00 2001 From: Nicholas Schwab Date: Wed, 7 Aug 2024 18:15:17 +0200 Subject: [PATCH 14/71] Set *.png and *.ico to be treated as binary files. The change b732beba6 makes all files text files and sets lf as eol. This will automatically change all files to have lf line endings if they are touched by git (e.g. via git status). This change cannot be stashed and makes it hard to work with the repo (rebase and checkout don't really work). See also #6183. Here, we set the offending files (*.png and *.ico, though there may be more in the future) to be treated as binary files and not be changed by git. --- .gitattributes | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitattributes b/.gitattributes index f7192096..648c78ca 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,2 +1,4 @@ llm/ext_server/* linguist-vendored * text eol=lf +*.png binary +*.ico binary From 1829fb61bd7a4186881714618f09b2877d0bc9a3 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Mon, 5 Aug 2024 17:13:52 -0700 Subject: [PATCH 15/71] manifest: Fix crash on startup when trying to clean up unused files (#5840) Currently if the config field is missing in the manifest file (or corrupted), Ollama will crash when it tries to read it. This can happen at startup or when pulling new models.
This data is mostly just used for showing model information so we can be tolerant of it not being present - it is not required to run the models. Besides avoiding crashing, this also gives us the ability to restructure the config in the future by pulling it into the main manifest file. --- server/images.go | 40 ++++++++++++++++++++++++---------------- server/layer.go | 15 ++++++++++++++- server/manifest.go | 18 ++++++++++-------- server/routes.go | 23 +++++++++++++---------- 4 files changed, 61 insertions(+), 35 deletions(-) diff --git a/server/images.go b/server/images.go index 05875a88..7ed35995 100644 --- a/server/images.go +++ b/server/images.go @@ -250,19 +250,21 @@ func GetModel(name string) (*Model, error) { Template: template.DefaultTemplate, } - filename, err := GetBlobsPath(manifest.Config.Digest) - if err != nil { - return nil, err - } + if manifest.Config.Digest != "" { + filename, err := GetBlobsPath(manifest.Config.Digest) + if err != nil { + return nil, err + } - configFile, err := os.Open(filename) - if err != nil { - return nil, err - } - defer configFile.Close() + configFile, err := os.Open(filename) + if err != nil { + return nil, err + } + defer configFile.Close() - if err := json.NewDecoder(configFile).Decode(&model.Config); err != nil { - return nil, err + if err := json.NewDecoder(configFile).Decode(&model.Config); err != nil { + return nil, err + } } for _, layer := range manifest.Layers { @@ -781,7 +783,7 @@ func PruneLayers() error { err = deleteUnusedLayers(nil, deleteMap) if err != nil { - slog.Info(fmt.Sprintf("couldn't remove unused layers: %v", err)) + slog.Error(fmt.Sprintf("couldn't remove unused layers: %v", err)) return nil } @@ -839,7 +841,9 @@ func PushModel(ctx context.Context, name string, regOpts *registryOptions, fn fu var layers []*Layer layers = append(layers, manifest.Layers...) - layers = append(layers, manifest.Config) + if manifest.Config.Digest != "" { + layers = append(layers, &manifest.Config) + } for _, layer := range layers { if err := uploadBlob(ctx, mp, layer, regOpts, fn); err != nil { @@ -890,7 +894,9 @@ func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn fu for _, l := range manifest.Layers { deleteMap[l.Digest] = struct{}{} } - deleteMap[manifest.Config.Digest] = struct{}{} + if manifest.Config.Digest != "" { + deleteMap[manifest.Config.Digest] = struct{}{} + } } } @@ -907,7 +913,9 @@ func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn fu var layers []*Layer layers = append(layers, manifest.Layers...) 
- layers = append(layers, manifest.Config) + if manifest.Config.Digest != "" { + layers = append(layers, &manifest.Config) + } skipVerify := make(map[string]bool) for _, layer := range layers { @@ -971,7 +979,7 @@ func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn fu fn(api.ProgressResponse{Status: "removing any unused layers"}) err = deleteUnusedLayers(nil, deleteMap) if err != nil { - slog.Info(fmt.Sprintf("couldn't remove unused layers: %v", err)) + slog.Error(fmt.Sprintf("couldn't remove unused layers: %v", err)) fn(api.ProgressResponse{Status: fmt.Sprintf("couldn't remove unused layers: %v", err)}) } } diff --git a/server/layer.go b/server/layer.go index cc6709d2..a2b66782 100644 --- a/server/layer.go +++ b/server/layer.go @@ -2,6 +2,7 @@ package server import ( "crypto/sha256" + "errors" "fmt" "io" "os" @@ -61,6 +62,10 @@ func NewLayer(r io.Reader, mediatype string) (*Layer, error) { } func NewLayerFromLayer(digest, mediatype, from string) (*Layer, error) { + if digest == "" { + return nil, errors.New("creating new layer from layer with empty digest") + } + blob, err := GetBlobsPath(digest) if err != nil { return nil, err @@ -81,6 +86,10 @@ func NewLayerFromLayer(digest, mediatype, from string) (*Layer, error) { } func (l *Layer) Open() (io.ReadSeekCloser, error) { + if l.Digest == "" { + return nil, errors.New("opening layer with empty digest") + } + blob, err := GetBlobsPath(l.Digest) if err != nil { return nil, err @@ -90,13 +99,17 @@ func (l *Layer) Open() (io.ReadSeekCloser, error) { } func (l *Layer) Remove() error { + if l.Digest == "" { + return nil + } + ms, err := Manifests() if err != nil { return err } for _, m := range ms { - for _, layer := range append(m.Layers, m.Config) { + for _, layer := range append(m.Layers, &m.Config) { if layer.Digest == l.Digest { // something is using this layer return nil diff --git a/server/manifest.go b/server/manifest.go index b8df11ef..b966ddbe 100644 --- a/server/manifest.go +++ b/server/manifest.go @@ -16,7 +16,7 @@ import ( type Manifest struct { SchemaVersion int `json:"schemaVersion"` MediaType string `json:"mediaType"` - Config *Layer `json:"config"` + Config Layer `json:"config"` Layers []*Layer `json:"layers"` filepath string @@ -25,7 +25,7 @@ type Manifest struct { } func (m *Manifest) Size() (size int64) { - for _, layer := range append(m.Layers, m.Config) { + for _, layer := range append(m.Layers, &m.Config) { size += layer.Size } @@ -46,11 +46,13 @@ func (m *Manifest) Remove() error { } func (m *Manifest) RemoveLayers() error { - for _, layer := range append(m.Layers, m.Config) { - if err := layer.Remove(); errors.Is(err, os.ErrNotExist) { - slog.Debug("layer does not exist", "digest", layer.Digest) - } else if err != nil { - return err + for _, layer := range append(m.Layers, &m.Config) { + if layer.Digest != "" { + if err := layer.Remove(); errors.Is(err, os.ErrNotExist) { + slog.Debug("layer does not exist", "digest", layer.Digest) + } else if err != nil { + return err + } } } @@ -113,7 +115,7 @@ func WriteManifest(name model.Name, config *Layer, layers []*Layer) error { m := Manifest{ SchemaVersion: 2, MediaType: "application/vnd.docker.distribution.manifest.v2+json", - Config: config, + Config: *config, Layers: layers, } diff --git a/server/routes.go b/server/routes.go index b9c66b65..e55eaa9d 100644 --- a/server/routes.go +++ b/server/routes.go @@ -824,17 +824,20 @@ func (s *Server) ListModelsHandler(c *gin.Context) { models := []api.ListModelResponse{} for n, m := range ms { - f, err := 
m.Config.Open() - if err != nil { - slog.Warn("bad manifest filepath", "name", n, "error", err) - continue - } - defer f.Close() - var cf ConfigV2 - if err := json.NewDecoder(f).Decode(&cf); err != nil { - slog.Warn("bad manifest config", "name", n, "error", err) - continue + + if m.Config.Digest != "" { + f, err := m.Config.Open() + if err != nil { + slog.Warn("bad manifest filepath", "name", n, "error", err) + continue + } + defer f.Close() + + if err := json.NewDecoder(f).Decode(&cf); err != nil { + slog.Warn("bad manifest config", "name", n, "error", err) + continue + } } // tag should never be masked From ad0c19dde403ba67aa27247775e33c33c30ee235 Mon Sep 17 00:00:00 2001 From: Kyle Kelley Date: Wed, 7 Aug 2024 14:20:50 -0700 Subject: [PATCH 16/71] Use llama3.1 in tools example (#5985) * Use llama3.1 in tools example * Update api.md --- docs/api.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/api.md b/docs/api.md index c0202ef3..5cbba523 100644 --- a/docs/api.md +++ b/docs/api.md @@ -669,7 +669,7 @@ curl http://localhost:11434/api/chat -d '{ ``` curl http://localhost:11434/api/chat -d '{ - "model": "mistral", + "model": "llama3.1", "messages": [ { "role": "user", @@ -708,7 +708,7 @@ curl http://localhost:11434/api/chat -d '{ ```json { - "model": "mistral:7b-instruct-v0.3-q4_K_M", + "model": "llama3.1", "created_at": "2024-07-22T20:33:28.123648Z", "message": { "role": "assistant", From 5b3a21b578da89b1682a98ce123a6b3c91697e9b Mon Sep 17 00:00:00 2001 From: royjhan <65097070+royjhan@users.noreply.github.com> Date: Wed, 7 Aug 2024 17:43:44 -0400 Subject: [PATCH 17/71] add metrics to docs (#6079) --- docs/api.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/api.md b/docs/api.md index 5cbba523..aed2b69f 100644 --- a/docs/api.md +++ b/docs/api.md @@ -1175,7 +1175,10 @@ curl http://localhost:11434/api/embed -d '{ "embeddings": [[ 0.010071029, -0.0017594862, 0.05007221, 0.04692972, 0.054916814, 0.008599704, 0.105441414, -0.025878139, 0.12958129, 0.031952348 - ]] + ]], + "total_duration": 14143917, + "load_duration": 1019500, + "prompt_eval_count": 8 } ``` From 97ec8cfd4ef13190f3939fbb24b6f146d570ed12 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Wed, 7 Aug 2024 11:44:25 -0700 Subject: [PATCH 18/71] image: Clarify argument to WriteManifest is config When creating a model the config layer is appended to the list of layers and then the last layer is used as the config when writing the manifest. This change directly uses the config layer to write the manifest. There is no behavior change but it is less error prone. 
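The hazard this removes is worth spelling out: when the config layer is only reachable as the last element of a combined slice, any later filtering or reordering silently changes which layer is written as the manifest config. A toy sketch of the two styles (simplified types and names, not the server's real code):

```go
package main

import "fmt"

type Layer struct {
	MediaType string
	Digest    string
}

func writeManifest(config Layer, layers []Layer) {
	fmt.Printf("config=%s layers=%d\n", config.Digest, len(layers))
}

func main() {
	layers := []Layer{{MediaType: "application/vnd.ollama.image.model", Digest: "sha256:model"}}
	config := Layer{MediaType: "application/vnd.docker.container.image.v1+json", Digest: "sha256:config"}

	// Fragile: the config is implicit in its position, so anything that
	// re-sorts or filters `all` changes what gets written as the config.
	all := append(layers, config)
	writeManifest(all[len(all)-1], layers)

	// Explicit: the config keeps its own name; iterating the combined slice
	// (e.g. to report per-layer progress) can never be confused with it.
	for _, layer := range append(layers, config) {
		_ = layer
	}
	writeManifest(config, layers)
}
```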
--- server/images.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/server/images.go b/server/images.go index 7ed35995..4202a413 100644 --- a/server/images.go +++ b/server/images.go @@ -625,12 +625,12 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio return err } - layer, err := NewLayer(&b, "application/vnd.docker.container.image.v1+json") + configLayer, err := NewLayer(&b, "application/vnd.docker.container.image.v1+json") if err != nil { return err } - for _, layer := range append(layers, layer) { + for _, layer := range append(layers, configLayer) { if layer.status != "" { fn(api.ProgressResponse{Status: layer.status}) } @@ -639,7 +639,7 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio old, _ := ParseNamedManifest(name) fn(api.ProgressResponse{Status: "writing manifest"}) - if err := WriteManifest(name, layer, layers); err != nil { + if err := WriteManifest(name, configLayer, layers); err != nil { return err } From 7edaf6e7e8d79a9c88419988ae98afaf3fc32f15 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Wed, 7 Aug 2024 14:22:17 -0700 Subject: [PATCH 19/71] manifest: Store layers inside manifests consistently as values. Commit 1829fb61 ("manifest: Fix crash on startup when trying to clean up unused files (#5840)") changed the config layer stored in manifests from a pointer to a value. This was done in order to avoid potential nil pointer dereferences after it is deserialized from JSON in the event that the field is missing. This changes the Layers slice to also be stored by value. This enables consistency in handling across the two objects. --- server/images.go | 14 +++++++------- server/layer.go | 28 ++++++++++++++-------------- server/manifest.go | 16 ++++++++-------- server/model.go | 2 +- server/routes_delete_test.go | 2 +- server/upload.go | 4 ++-- 6 files changed, 33 insertions(+), 33 deletions(-) diff --git a/server/images.go b/server/images.go index 4202a413..0e753f56 100644 --- a/server/images.go +++ b/server/images.go @@ -373,7 +373,7 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio var messages []*api.Message parameters := make(map[string]any) - var layers []*Layer + var layers []Layer for _, c := range modelfile.Commands { mediatype := fmt.Sprintf("application/vnd.ollama.image.%s", c.Name) @@ -499,7 +499,7 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio if c.Name != "license" { // replace - layers = slices.DeleteFunc(layers, func(layer *Layer) bool { + layers = slices.DeleteFunc(layers, func(layer Layer) bool { if layer.MediaType != mediatype { return false } @@ -545,7 +545,7 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio } var err2 error - layers = slices.DeleteFunc(layers, func(layer *Layer) bool { + layers = slices.DeleteFunc(layers, func(layer Layer) bool { switch layer.MediaType { case "application/vnd.ollama.image.message": // if there are new messages, remove the inherited ones @@ -839,10 +839,10 @@ func PushModel(ctx context.Context, name string, regOpts *registryOptions, fn fu return err } - var layers []*Layer + var layers []Layer layers = append(layers, manifest.Layers...) 
if manifest.Config.Digest != "" { - layers = append(layers, &manifest.Config) + layers = append(layers, manifest.Config) } for _, layer := range layers { @@ -911,10 +911,10 @@ func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn fu return fmt.Errorf("pull model manifest: %s", err) } - var layers []*Layer + var layers []Layer layers = append(layers, manifest.Layers...) if manifest.Config.Digest != "" { - layers = append(layers, &manifest.Config) + layers = append(layers, manifest.Config) } skipVerify := make(map[string]bool) diff --git a/server/layer.go b/server/layer.go index a2b66782..c666bd10 100644 --- a/server/layer.go +++ b/server/layer.go @@ -16,15 +16,15 @@ type Layer struct { status string } -func NewLayer(r io.Reader, mediatype string) (*Layer, error) { +func NewLayer(r io.Reader, mediatype string) (Layer, error) { blobs, err := GetBlobsPath("") if err != nil { - return nil, err + return Layer{}, err } temp, err := os.CreateTemp(blobs, "sha256-") if err != nil { - return nil, err + return Layer{}, err } defer temp.Close() defer os.Remove(temp.Name()) @@ -32,28 +32,28 @@ func NewLayer(r io.Reader, mediatype string) (*Layer, error) { sha256sum := sha256.New() n, err := io.Copy(io.MultiWriter(temp, sha256sum), r) if err != nil { - return nil, err + return Layer{}, err } if err := temp.Close(); err != nil { - return nil, err + return Layer{}, err } digest := fmt.Sprintf("sha256:%x", sha256sum.Sum(nil)) blob, err := GetBlobsPath(digest) if err != nil { - return nil, err + return Layer{}, err } status := "using existing layer" if _, err := os.Stat(blob); err != nil { status = "creating new layer" if err := os.Rename(temp.Name(), blob); err != nil { - return nil, err + return Layer{}, err } } - return &Layer{ + return Layer{ MediaType: mediatype, Digest: digest, Size: n, @@ -61,22 +61,22 @@ func NewLayer(r io.Reader, mediatype string) (*Layer, error) { }, nil } -func NewLayerFromLayer(digest, mediatype, from string) (*Layer, error) { +func NewLayerFromLayer(digest, mediatype, from string) (Layer, error) { if digest == "" { - return nil, errors.New("creating new layer from layer with empty digest") + return Layer{}, errors.New("creating new layer from layer with empty digest") } blob, err := GetBlobsPath(digest) if err != nil { - return nil, err + return Layer{}, err } fi, err := os.Stat(blob) if err != nil { - return nil, err + return Layer{}, err } - return &Layer{ + return Layer{ MediaType: mediatype, Digest: digest, Size: fi.Size(), @@ -109,7 +109,7 @@ func (l *Layer) Remove() error { } for _, m := range ms { - for _, layer := range append(m.Layers, &m.Config) { + for _, layer := range append(m.Layers, m.Config) { if layer.Digest == l.Digest { // something is using this layer return nil diff --git a/server/manifest.go b/server/manifest.go index b966ddbe..6a5d7b88 100644 --- a/server/manifest.go +++ b/server/manifest.go @@ -14,10 +14,10 @@ import ( ) type Manifest struct { - SchemaVersion int `json:"schemaVersion"` - MediaType string `json:"mediaType"` - Config Layer `json:"config"` - Layers []*Layer `json:"layers"` + SchemaVersion int `json:"schemaVersion"` + MediaType string `json:"mediaType"` + Config Layer `json:"config"` + Layers []Layer `json:"layers"` filepath string fi os.FileInfo @@ -25,7 +25,7 @@ type Manifest struct { } func (m *Manifest) Size() (size int64) { - for _, layer := range append(m.Layers, &m.Config) { + for _, layer := range append(m.Layers, m.Config) { size += layer.Size } @@ -46,7 +46,7 @@ func (m *Manifest) Remove() error { } func (m 
*Manifest) RemoveLayers() error { - for _, layer := range append(m.Layers, &m.Config) { + for _, layer := range append(m.Layers, m.Config) { if layer.Digest != "" { if err := layer.Remove(); errors.Is(err, os.ErrNotExist) { slog.Debug("layer does not exist", "digest", layer.Digest) @@ -95,7 +95,7 @@ func ParseNamedManifest(n model.Name) (*Manifest, error) { return &m, nil } -func WriteManifest(name model.Name, config *Layer, layers []*Layer) error { +func WriteManifest(name model.Name, config Layer, layers []Layer) error { manifests, err := GetManifestPath() if err != nil { return err @@ -115,7 +115,7 @@ func WriteManifest(name model.Name, config *Layer, layers []*Layer) error { m := Manifest{ SchemaVersion: 2, MediaType: "application/vnd.docker.distribution.manifest.v2+json", - Config: *config, + Config: config, Layers: layers, } diff --git a/server/model.go b/server/model.go index f2946a0b..ad6e4e55 100644 --- a/server/model.go +++ b/server/model.go @@ -26,7 +26,7 @@ import ( var intermediateBlobs map[string]string = make(map[string]string) type layerGGML struct { - *Layer + Layer *llm.GGML } diff --git a/server/routes_delete_test.go b/server/routes_delete_test.go index 1c950d66..82fac9f5 100644 --- a/server/routes_delete_test.go +++ b/server/routes_delete_test.go @@ -98,7 +98,7 @@ func TestDeleteDuplicateLayers(t *testing.T) { } // create a manifest with duplicate layers - if err := WriteManifest(n, config, []*Layer{config}); err != nil { + if err := WriteManifest(n, config, []Layer{config}); err != nil { t.Fatal(err) } diff --git a/server/upload.go b/server/upload.go index b5a244ea..2f115436 100644 --- a/server/upload.go +++ b/server/upload.go @@ -26,7 +26,7 @@ import ( var blobUploadManager sync.Map type blobUpload struct { - *Layer + Layer Total int64 Completed atomic.Int64 @@ -362,7 +362,7 @@ func (p *progressWriter) Rollback() { p.written = 0 } -func uploadBlob(ctx context.Context, mp ModelPath, layer *Layer, opts *registryOptions, fn func(api.ProgressResponse)) error { +func uploadBlob(ctx context.Context, mp ModelPath, layer Layer, opts *registryOptions, fn func(api.ProgressResponse)) error { requestURL := mp.BaseURL() requestURL = requestURL.JoinPath("v2", mp.GetNamespaceRepository(), "blobs", layer.Digest) From 7b61eba47159748bcfc35227a13e31c899a84e49 Mon Sep 17 00:00:00 2001 From: Jitang Lei Date: Thu, 8 Aug 2024 20:28:01 +0800 Subject: [PATCH 20/71] server/download.go: Fix a typo in log Signed-off-by: Jitang Lei --- server/download.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/download.go b/server/download.go index 38d24a6b..cf31df5e 100644 --- a/server/download.go +++ b/server/download.go @@ -235,7 +235,7 @@ func (b *blobDownload) run(ctx context.Context, requestURL *url.URL, opts *regis newOpts.CheckRedirect = func(req *http.Request, via []*http.Request) error { if len(via) > 10 { - return errors.New("maxium redirects exceeded (10) for directURL") + return errors.New("maximum redirects exceeded (10) for directURL") } // if the hostname is the same, allow the redirect From 67472e0e89f516ccfbfad2d11414aadf484b7642 Mon Sep 17 00:00:00 2001 From: Nicholas42 Date: Fri, 9 Aug 2024 13:41:20 +0200 Subject: [PATCH 21/71] Also flag *.icns as binary --- .gitattributes | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitattributes b/.gitattributes index 648c78ca..baabd3c1 100644 --- a/.gitattributes +++ b/.gitattributes @@ -2,3 +2,4 @@ llm/ext_server/* linguist-vendored * text eol=lf *.png binary *.ico binary +*.icns binary From 
5bca2e60a7baefe582077469a1d14ff516b5d322 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Fri, 9 Aug 2024 11:31:38 -0700 Subject: [PATCH 22/71] Harden Intel bootstrap for nil pointers --- gpu/gpu.go | 61 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 32 insertions(+), 29 deletions(-) diff --git a/gpu/gpu.go b/gpu/gpu.go index 7ae8fbec..dc124a3e 100644 --- a/gpu/gpu.go +++ b/gpu/gpu.go @@ -305,38 +305,41 @@ func GetGPUInfo() GpuInfoList { // Intel if envconfig.IntelGPU() { oHandles = initOneAPIHandles() - // On windows we bundle the oneapi library one level above the runner dir - depPath = "" - if runtime.GOOS == "windows" && envconfig.RunnersDir() != "" { - depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir()), "oneapi") - } + if oHandles != nil && oHandles.oneapi != nil { - for d := range oHandles.oneapi.num_drivers { - if oHandles.oneapi == nil { - // shouldn't happen - slog.Warn("nil oneapi handle with driver count", "count", int(oHandles.oneapi.num_drivers)) - continue + // On windows we bundle the oneapi library one level above the runner dir + depPath = "" + if runtime.GOOS == "windows" && envconfig.RunnersDir() != "" { + depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir()), "oneapi") } - devCount := C.oneapi_get_device_count(*oHandles.oneapi, C.int(d)) - for i := range devCount { - gpuInfo := OneapiGPUInfo{ - GpuInfo: GpuInfo{ - Library: "oneapi", - }, - driverIndex: int(d), - gpuIndex: int(i), + + for d := range oHandles.oneapi.num_drivers { + if oHandles.oneapi == nil { + // shouldn't happen + slog.Warn("nil oneapi handle with driver count", "count", int(oHandles.oneapi.num_drivers)) + continue + } + devCount := C.oneapi_get_device_count(*oHandles.oneapi, C.int(d)) + for i := range devCount { + gpuInfo := OneapiGPUInfo{ + GpuInfo: GpuInfo{ + Library: "oneapi", + }, + driverIndex: int(d), + gpuIndex: int(i), + } + // TODO - split bootstrapping from updating free memory + C.oneapi_check_vram(*oHandles.oneapi, C.int(d), i, &memInfo) + // TODO - convert this to MinimumMemory based on testing... + var totalFreeMem float64 = float64(memInfo.free) * 0.95 // work-around: leave some reserve vram for mkl lib used in ggml-sycl backend. + memInfo.free = C.uint64_t(totalFreeMem) + gpuInfo.TotalMemory = uint64(memInfo.total) + gpuInfo.FreeMemory = uint64(memInfo.free) + gpuInfo.ID = C.GoString(&memInfo.gpu_id[0]) + gpuInfo.Name = C.GoString(&memInfo.gpu_name[0]) + gpuInfo.DependencyPath = depPath + oneapiGPUs = append(oneapiGPUs, gpuInfo) } - // TODO - split bootstrapping from updating free memory - C.oneapi_check_vram(*oHandles.oneapi, C.int(d), i, &memInfo) - // TODO - convert this to MinimumMemory based on testing... - var totalFreeMem float64 = float64(memInfo.free) * 0.95 // work-around: leave some reserve vram for mkl lib used in ggml-sycl backend. - memInfo.free = C.uint64_t(totalFreeMem) - gpuInfo.TotalMemory = uint64(memInfo.total) - gpuInfo.FreeMemory = uint64(memInfo.free) - gpuInfo.ID = C.GoString(&memInfo.gpu_id[0]) - gpuInfo.Name = C.GoString(&memInfo.gpu_name[0]) - gpuInfo.DependencyPath = depPath - oneapiGPUs = append(oneapiGPUs, gpuInfo) } } } From 2fa1db434581bcfcb6fec1482904656e4b5f8313 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Fri, 9 Aug 2024 11:57:48 -0700 Subject: [PATCH 23/71] Don't hard fail on sparse setup error It seems this can fail in some cases, so proceed with the download anyway.
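The shape of the fix, roughly: setSparse stops returning an error entirely, so filesystems without sparse support (exFAT, for example) can never abort a download. A sketch of the caller-side pattern with a stand-in stub helper (the real Windows variant issues FSCTL_SET_SPARSE via golang.org/x/sys/windows, as the diff below shows):

```go
package main

import (
	"fmt"
	"log"
	"os"
)

// setSparse is deliberately best-effort: no return value, so an unsupported
// filesystem can never turn preallocation into a hard failure. This stub is
// the non-Windows no-op; the Windows version ignores the DeviceIoControl error.
func setSparse(*os.File) {}

func main() {
	f, err := os.CreateTemp("", "blob-")
	if err != nil {
		log.Fatal(err)
	}
	defer os.Remove(f.Name())
	defer f.Close()

	setSparse(f) // best effort; proceed with the download either way

	// With the sparse flag set (where supported), Truncate reserves the full
	// range without eagerly writing zeros, avoiding the up-front I/O storm
	// that patch 09 describes.
	if err := f.Truncate(64 << 20); err != nil {
		log.Fatal(err)
	}
	fi, err := f.Stat()
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println("logical size:", fi.Size())
}
```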
--- server/download.go | 4 +--- server/sparse_common.go | 3 +-- server/sparse_windows.go | 5 +++-- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/server/download.go b/server/download.go index 38d24a6b..5965b322 100644 --- a/server/download.go +++ b/server/download.go @@ -216,9 +216,7 @@ func (b *blobDownload) run(ctx context.Context, requestURL *url.URL, opts *regis return err } defer file.Close() - if err := setSparse(file); err != nil { - return err - } + setSparse(file) _ = file.Truncate(b.Total) diff --git a/server/sparse_common.go b/server/sparse_common.go index f25627fc..c88b2da0 100644 --- a/server/sparse_common.go +++ b/server/sparse_common.go @@ -4,6 +4,5 @@ package server import "os" -func setSparse(file *os.File) error { - return nil +func setSparse(*os.File) { } diff --git a/server/sparse_windows.go b/server/sparse_windows.go index cdad379e..f21cbbda 100644 --- a/server/sparse_windows.go +++ b/server/sparse_windows.go @@ -6,8 +6,9 @@ import ( "golang.org/x/sys/windows" ) -func setSparse(file *os.File) error { - return windows.DeviceIoControl( +func setSparse(file *os.File) { + // exFat (and other FS types) don't support sparse files, so ignore errors + windows.DeviceIoControl( //nolint:errcheck windows.Handle(file.Fd()), windows.FSCTL_SET_SPARSE, nil, 0, nil, 0, From d4e640746469ac586f12b400384c4ae7354d9280 Mon Sep 17 00:00:00 2001 From: Nicholas Schwab Date: Fri, 9 Aug 2024 23:14:13 +0200 Subject: [PATCH 24/71] Restrict text files with explicit line feeds to *.go. This partially reverts b732beba6a919b852539bb344b05e25c6a7c3c90. It seems that explicitly setting all files to use line feeds was done due to issues with the go linter, so the setting can be restricted to those files (https://github.com/ollama/ollama/pull/6235#issuecomment-2278745953). --- .gitattributes | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/.gitattributes b/.gitattributes index baabd3c1..f1c8bcb4 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,5 +1,3 @@ llm/ext_server/* linguist-vendored -* text eol=lf -*.png binary -*.ico binary -*.icns binary +* text=auto +*.go text eol=lf From 023451ce471e7781bee65505011c48b9e5541811 Mon Sep 17 00:00:00 2001 From: CognitiveTech Date: Sat, 10 Aug 2024 21:43:08 -0400 Subject: [PATCH 25/71] add integration obook-summary (#6305) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 7c606e1c..aae92e6c 100644 --- a/README.md +++ b/README.md @@ -325,6 +325,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [tlm](https://github.com/yusufcanb/tlm) - [podman-ollama](https://github.com/ericcurtin/podman-ollama) - [gollama](https://github.com/sammcj/gollama) +- [Ollama eBook Summary](https://github.com/cognitivetech/ollama-ebook-summary/) ### Database From 25906d72d1482bc9dc2e4300a42c8db4823ee1a3 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Sun, 11 Aug 2024 11:30:20 -0700 Subject: [PATCH 26/71] llm: prevent loading too large models on Windows (#5926) Don't allow loading models that would lead to memory exhaustion (across vram, system memory and disk paging). This check was already applied on Linux and should now be applied on Windows as well.
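In outline, the guard compares the CPU-resident portion of the load (the total estimate minus what lands in VRAM) against free RAM plus free swap; only macOS is exempt because its swap grows dynamically. A simplified sketch of the check (field and function names approximate the server code; this is not the exact implementation):

```go
package main

import (
	"fmt"
	"runtime"
)

type memoryEstimate struct {
	TotalSize uint64 // whole model footprint
	VRAMSize  uint64 // portion offloaded to the GPU
}

func checkSystemMemory(e memoryEstimate, freeMemory, freeSwap uint64) error {
	// Darwin has fully dynamic swap, so there is no fixed budget to enforce.
	if runtime.GOOS == "darwin" {
		return nil
	}
	required := e.TotalSize - e.VRAMSize // what must fit in system memory
	if available := freeMemory + freeSwap; required > available {
		return fmt.Errorf("model requires %d bytes of system memory, only %d available", required, available)
	}
	return nil
}

func main() {
	e := memoryEstimate{TotalSize: 8 << 30, VRAMSize: 6 << 30}
	fmt.Println(checkSystemMemory(e, 1<<30, 512<<20))
}
```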
--- llm/server.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llm/server.go b/llm/server.go index 41736068..0bd94f35 100644 --- a/llm/server.go +++ b/llm/server.go @@ -125,8 +125,9 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr } } - // On linux, over-allocating CPU memory will almost always result in an error - if runtime.GOOS == "linux" { + // On linux and windows, over-allocating CPU memory will almost always result in an error + // Darwin has fully dynamic swap so has no direct concept of free swap space + if runtime.GOOS != "darwin" { systemMemoryRequired := estimate.TotalSize - estimate.VRAMSize available := systemFreeMemory + systemSwapFreeMemory if systemMemoryRequired > available { From 15c2d8fe149ba2b58aadbab615a6955f8821c7a9 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Sun, 11 Aug 2024 11:57:10 -0700 Subject: [PATCH 27/71] server: parallelize embeddings in API web handler instead of in subprocess runner (#6220) For simplicity, perform parallelization of embedding requests in the API handler instead of offloading this to the subprocess runner. This keeps the scheduling story simpler as it builds on existing parallel requests, similar to existing text completion functionality. --- llm/ext_server/server.cpp | 42 ++++++++------------------------------- llm/server.go | 32 +++++++++++++---------------- server/routes.go | 42 +++++++++++++++++++++++++-------------- server/sched_test.go | 8 ++++---- 4 files changed, 53 insertions(+), 71 deletions(-) diff --git a/llm/ext_server/server.cpp b/llm/ext_server/server.cpp index c65901c7..5717c17a 100644 --- a/llm/ext_server/server.cpp +++ b/llm/ext_server/server.cpp @@ -1223,9 +1223,7 @@ struct llama_server_context res.result_json = json { - {"id", res.id}, {"embedding", std::vector(embd, embd + n_embd)}, - {"timings", slot.get_formated_timings()}, }; } } @@ -3194,41 +3192,17 @@ int main(int argc, char **argv) { prompt = ""; } - if (prompt.size() == 1) { - prompt = prompt[0]; - } - // create and queue the task - json responses; - { - const int id_task = llama.queue_tasks.get_new_id(); - llama.queue_results.add_waiting_task_id(id_task); - llama.request_completion(id_task, {{"prompt", prompt}}, true, -1); + const int task_id = llama.queue_tasks.get_new_id(); + llama.queue_results.add_waiting_task_id(task_id); + llama.request_completion(task_id, {{"prompt", prompt}}, true, -1); - // get the result - task_result result = llama.queue_results.recv(id_task); - llama.queue_results.remove_waiting_task_id(id_task); - if (result.error) { - return res.set_content(result.result_json.dump(), "application/json; charset=utf-8"); - } + // get the result + task_result result = llama.queue_results.recv(task_id); + llama.queue_results.remove_waiting_task_id(task_id); - responses = result.result_json.value("results", std::vector{result.result_json}); - std::sort(responses.begin(), responses.end(), [](const json& a, const json& b) { - return a["id"] < b["id"]; - }); - - json embeddings = json::array(); - - int prompt_n = 0; - for (auto & elem : responses) { - embeddings.push_back(elem.at("embedding")); - prompt_n += elem.at("timings").at("prompt_n").get(); - } - - // send the result - json embedding_res = json{{"embedding", embeddings}, {"prompt_n", prompt_n}}; - return res.set_content(embedding_res.dump(), "application/json; charset=utf-8"); - } + // send the result + return res.set_content(result.result_json.dump(), "application/json; charset=utf-8"); }); // GG: if I put the main loop inside a 
thread, it crashes on the first request when build in Debug!? diff --git a/llm/server.go b/llm/server.go index 0bd94f35..d2b8db9b 100644 --- a/llm/server.go +++ b/llm/server.go @@ -33,7 +33,7 @@ type LlamaServer interface { Ping(ctx context.Context) error WaitUntilRunning(ctx context.Context) error Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error - Embed(ctx context.Context, input []string) (*EmbedResponse, error) + Embedding(ctx context.Context, input string) ([]float32, error) Tokenize(ctx context.Context, content string) ([]int, error) Detokenize(ctx context.Context, tokens []int) (string, error) Close() error @@ -883,24 +883,20 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu return nil } -type EmbedRequest struct { - Content []string `json:"content"` +type EmbeddingRequest struct { + Content string `json:"content"` } -type EmbedResponse struct { - Embedding [][]float32 `json:"embedding"` - PromptEvalCount int `json:"prompt_n"` +type EmbeddingResponse struct { + Embedding []float32 `json:"embedding"` } -func (s *llmServer) Embed(ctx context.Context, input []string) (*EmbedResponse, error) { - // each input will use a slot, so we need to acquire the semaphore for - // the number of inputs up to numParallel - slots := int64(min(len(input), s.numParallel)) - if err := s.sem.Acquire(ctx, slots); err != nil { +func (s *llmServer) Embedding(ctx context.Context, input string) ([]float32, error) { + if err := s.sem.Acquire(ctx, 1); err != nil { slog.Error("Failed to acquire semaphore", "error", err) return nil, err } - defer s.sem.Release(slots) + defer s.sem.Release(1) // Make sure the server is ready status, err := s.getServerStatusRetry(ctx) @@ -910,18 +906,18 @@ func (s *llmServer) Embed(ctx context.Context, input []string) (*EmbedResponse, return nil, fmt.Errorf("unexpected server status: %s", status.ToString()) } - data, err := json.Marshal(EmbedRequest{Content: input}) + data, err := json.Marshal(EmbeddingRequest{Content: input}) if err != nil { return nil, fmt.Errorf("error marshaling embed data: %w", err) } - req, err := http.NewRequestWithContext(ctx, http.MethodPost, fmt.Sprintf("http://127.0.0.1:%d/embedding", s.port), bytes.NewBuffer(data)) + r, err := http.NewRequestWithContext(ctx, http.MethodPost, fmt.Sprintf("http://127.0.0.1:%d/embedding", s.port), bytes.NewBuffer(data)) if err != nil { return nil, fmt.Errorf("error creating embed request: %w", err) } - req.Header.Set("Content-Type", "application/json") + r.Header.Set("Content-Type", "application/json") - resp, err := http.DefaultClient.Do(req) + resp, err := http.DefaultClient.Do(r) if err != nil { return nil, fmt.Errorf("do embedding request: %w", err) } @@ -937,12 +933,12 @@ func (s *llmServer) Embed(ctx context.Context, input []string) (*EmbedResponse, return nil, fmt.Errorf("%s", body) } - var e EmbedResponse + var e EmbeddingResponse if err := json.Unmarshal(body, &e); err != nil { return nil, fmt.Errorf("unmarshal tokenize response: %w", err) } - return &e, nil + return e.Embedding, nil } type TokenizeRequest struct { diff --git a/server/routes.go b/server/routes.go index e55eaa9d..e5a31002 100644 --- a/server/routes.go +++ b/server/routes.go @@ -23,6 +23,7 @@ import ( "github.com/gin-contrib/cors" "github.com/gin-gonic/gin" + "golang.org/x/sync/errgroup" "github.com/ollama/ollama/api" "github.com/ollama/ollama/envconfig" @@ -346,6 +347,7 @@ func (s *Server) EmbedHandler(c *gin.Context) { return } + var count int for i, s := range input { 
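		// Aside: the fan-out this patch adds further down is the stock errgroup
		// pattern; a minimal standalone sketch, assuming a hypothetical
		// embedOne(ctx, s) ([]float32, error) helper rather than the patch's code:
		//
		//	g, ctx := errgroup.WithContext(c.Request.Context())
		//	results := make([][]float32, len(input))
		//	for i, text := range input {
		//		g.Go(func() error {
		//			e, err := embedOne(ctx, text)
		//			if err != nil {
		//				return err
		//			}
		//			results[i] = e
		//			return nil
		//		})
		//	}
		//	if err := g.Wait(); err != nil {
		//		return err // first failure wins; ctx is cancelled for the siblings
		//	}
		//
		// Concurrency stays bounded even without a limit here, because each
		// Embedding call acquires one slot from the runner's semaphore
		// (s.sem.Acquire in llm/server.go above); g.SetLimit(n) would be an
		// alternative way to cap in-flight goroutines.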
tokens, err := r.Tokenize(c.Request.Context(), s) if err != nil { @@ -368,25 +370,36 @@ func (s *Server) EmbedHandler(c *gin.Context) { } } + count += len(tokens) + input[i] = s } - embeddings, err := r.Embed(c.Request.Context(), input) - if err != nil { - slog.Error("embedding generation failed", "error", err) - c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to generate embedding"}) - return + + var g errgroup.Group + embeddings := make([][]float32, len(input)) + for i, text := range input { + g.Go(func() error { + embedding, err := r.Embedding(c.Request.Context(), text) + if err != nil { + return err + } + embeddings[i] = normalize(embedding) + return nil + }) } - for i, e := range embeddings.Embedding { - embeddings.Embedding[i] = normalize(e) + if err := g.Wait(); err != nil { + slog.Error("embedding generation failed", "error", err) + c.JSON(http.StatusInternalServerError, gin.H{"error": fmt.Errorf("failed to generate embeddings: %v", err)}) + return } resp := api.EmbedResponse{ Model: req.Model, - Embeddings: embeddings.Embedding, + Embeddings: embeddings, TotalDuration: time.Since(checkpointStart), LoadDuration: checkpointLoaded.Sub(checkpointStart), - PromptEvalCount: embeddings.PromptEvalCount, + PromptEvalCount: count, } c.JSON(http.StatusOK, resp) } @@ -430,21 +443,20 @@ func (s *Server) EmbeddingsHandler(c *gin.Context) { return } - embeddings, err := r.Embed(c.Request.Context(), []string{req.Prompt}) + embedding, err := r.Embedding(c.Request.Context(), req.Prompt) if err != nil { slog.Info(fmt.Sprintf("embedding generation failed: %v", err)) c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to generate embedding"}) return } - embedding := make([]float64, len(embeddings.Embedding[0])) - - for i, v := range embeddings.Embedding[0] { - embedding[i] = float64(v) + var e []float64 + for _, v := range embedding { + e = append(e, float64(v)) } resp := api.EmbeddingResponse{ - Embedding: embedding, + Embedding: e, } c.JSON(http.StatusOK, resp) } diff --git a/server/sched_test.go b/server/sched_test.go index c8717430..713b9259 100644 --- a/server/sched_test.go +++ b/server/sched_test.go @@ -708,8 +708,8 @@ type mockLlm struct { pingResp error waitResp error completionResp error - embedResp *llm.EmbedResponse - embedRespErr error + embeddingResp []float32 + embeddingRespErr error tokenizeResp []int tokenizeRespErr error detokenizeResp string @@ -727,8 +727,8 @@ func (s *mockLlm) Completion(ctx context.Context, req llm.CompletionRequest, fn return s.completionResp } -func (s *mockLlm) Embed(ctx context.Context, input []string) (*llm.EmbedResponse, error) { - return s.embedResp, s.embedRespErr +func (s *mockLlm) Embedding(ctx context.Context, input string) ([]float32, error) { + return s.embeddingResp, s.embeddingRespErr } func (s *mockLlm) Tokenize(ctx context.Context, content string) ([]int, error) { From 8aac22438ef34192ff804dbeb1b5e9a7e180eb7c Mon Sep 17 00:00:00 2001 From: Josh <76125168+joshyan1@users.noreply.github.com> Date: Mon, 12 Aug 2024 09:28:55 -0700 Subject: [PATCH 28/71] server: speed up single gguf creates (#5898) --- server/model.go | 17 +++++++-- server/model_test.go | 82 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 96 insertions(+), 3 deletions(-) diff --git a/server/model.go b/server/model.go index ad6e4e55..8d7ed7e6 100644 --- a/server/model.go +++ b/server/model.go @@ -176,9 +176,20 @@ func parseFromFile(ctx context.Context, file *os.File, digest string, fn func(ap mediatype = "application/vnd.ollama.image.projector" } - 
layer, err := NewLayer(io.NewSectionReader(file, offset, n), mediatype) - if err != nil { - return nil, err + var layer *Layer + if digest != "" && n == stat.Size() && offset == 0 { + layer, err = NewLayerFromLayer(digest, mediatype, file.Name()) + if err != nil { + slog.Debug("could not create new layer from layer", "error", err) + } + } + + // Fallback to creating layer from file copy (either NewLayerFromLayer failed, or digest empty/n != stat.Size()) + if layer == nil { + layer, err = NewLayer(io.NewSectionReader(file, offset, n), mediatype) + if err != nil { + return nil, err + } } layers = append(layers, &layerGGML{layer, ggml}) diff --git a/server/model_test.go b/server/model_test.go index aa214d3d..63fc408d 100644 --- a/server/model_test.go +++ b/server/model_test.go @@ -2,8 +2,10 @@ package server import ( "bytes" + "context" "encoding/json" "fmt" + "io" "os" "path/filepath" "testing" @@ -11,6 +13,7 @@ import ( "github.com/google/go-cmp/cmp" "github.com/ollama/ollama/api" + "github.com/ollama/ollama/llm" "github.com/ollama/ollama/template" ) @@ -133,3 +136,82 @@ The temperature in San Francisco, CA is 70°F and in Toronto, Canada is 20°C.`, }) } } + +func TestParseFromFileFromLayer(t *testing.T) { + tempModels := t.TempDir() + + file, err := os.CreateTemp(tempModels, "") + if err != nil { + t.Fatalf("failed to open file: %v", err) + } + defer file.Close() + if err := llm.WriteGGUF(file, llm.KV{"general.architecture": "gemma"}, []llm.Tensor{}); err != nil { + t.Fatalf("failed to write gguf: %v", err) + } + + if _, err := file.Seek(0, io.SeekStart); err != nil { + t.Fatalf("failed to seek to start: %v", err) + } + + layers, err := parseFromFile(context.Background(), file, "", func(api.ProgressResponse) {}) + if err != nil { + t.Fatalf("failed to parse from file: %v", err) + } + + if len(layers) != 1 { + t.Fatalf("got %d != want 1", len(layers)) + } + + if _, err := file.Seek(0, io.SeekStart); err != nil { + t.Fatalf("failed to seek to start: %v", err) + } + + layers2, err := parseFromFile(context.Background(), file, layers[0].Digest, func(api.ProgressResponse) {}) + if err != nil { + t.Fatalf("failed to parse from file: %v", err) + } + if len(layers2) != 1 { + t.Fatalf("got %d != want 1", len(layers2)) + } + + if layers[0].Digest != layers2[0].Digest { + t.Fatalf("got %s != want %s", layers[0].Digest, layers2[0].Digest) + } + + if layers[0].Size != layers2[0].Size { + t.Fatalf("got %d != want %d", layers[0].Size, layers2[0].Size) + } + + if layers[0].MediaType != layers2[0].MediaType { + t.Fatalf("got %v != want %v", layers[0].MediaType, layers2[0].MediaType) + } +} + +func TestParseLayerFromCopy(t *testing.T) { + tempModels := t.TempDir() + + file2, err := os.CreateTemp(tempModels, "") + if err != nil { + t.Fatalf("failed to open file: %v", err) + } + defer file2.Close() + + for range 5 { + if err := llm.WriteGGUF(file2, llm.KV{"general.architecture": "gemma"}, []llm.Tensor{}); err != nil { + t.Fatalf("failed to write gguf: %v", err) + } + } + + if _, err := file2.Seek(0, io.SeekStart); err != nil { + t.Fatalf("failed to seek to start: %v", err) + } + + layers, err := parseFromFile(context.Background(), file2, "", func(api.ProgressResponse) {}) + if err != nil { + t.Fatalf("failed to parse from file: %v", err) + } + + if len(layers) != 5 { + t.Fatalf("got %d != want 5", len(layers)) + } +} From 1dc3ef3aa9d451a63fcb6ea2e1b6ea5289a1a325 Mon Sep 17 00:00:00 2001 From: Josh <76125168+joshyan1@users.noreply.github.com> Date: Mon, 12 Aug 2024 09:57:51 -0700 Subject: [PATCH 29/71] Revert 
"server: speed up single gguf creates (#5898)" (#6323) This reverts commit 8aac22438ef34192ff804dbeb1b5e9a7e180eb7c. --- server/model.go | 17 ++------- server/model_test.go | 82 -------------------------------------------- 2 files changed, 3 insertions(+), 96 deletions(-) diff --git a/server/model.go b/server/model.go index 8d7ed7e6..ad6e4e55 100644 --- a/server/model.go +++ b/server/model.go @@ -176,20 +176,9 @@ func parseFromFile(ctx context.Context, file *os.File, digest string, fn func(ap mediatype = "application/vnd.ollama.image.projector" } - var layer *Layer - if digest != "" && n == stat.Size() && offset == 0 { - layer, err = NewLayerFromLayer(digest, mediatype, file.Name()) - if err != nil { - slog.Debug("could not create new layer from layer", "error", err) - } - } - - // Fallback to creating layer from file copy (either NewLayerFromLayer failed, or digest empty/n != stat.Size()) - if layer == nil { - layer, err = NewLayer(io.NewSectionReader(file, offset, n), mediatype) - if err != nil { - return nil, err - } + layer, err := NewLayer(io.NewSectionReader(file, offset, n), mediatype) + if err != nil { + return nil, err } layers = append(layers, &layerGGML{layer, ggml}) diff --git a/server/model_test.go b/server/model_test.go index 63fc408d..aa214d3d 100644 --- a/server/model_test.go +++ b/server/model_test.go @@ -2,10 +2,8 @@ package server import ( "bytes" - "context" "encoding/json" "fmt" - "io" "os" "path/filepath" "testing" @@ -13,7 +11,6 @@ import ( "github.com/google/go-cmp/cmp" "github.com/ollama/ollama/api" - "github.com/ollama/ollama/llm" "github.com/ollama/ollama/template" ) @@ -136,82 +133,3 @@ The temperature in San Francisco, CA is 70°F and in Toronto, Canada is 20°C.`, }) } } - -func TestParseFromFileFromLayer(t *testing.T) { - tempModels := t.TempDir() - - file, err := os.CreateTemp(tempModels, "") - if err != nil { - t.Fatalf("failed to open file: %v", err) - } - defer file.Close() - if err := llm.WriteGGUF(file, llm.KV{"general.architecture": "gemma"}, []llm.Tensor{}); err != nil { - t.Fatalf("failed to write gguf: %v", err) - } - - if _, err := file.Seek(0, io.SeekStart); err != nil { - t.Fatalf("failed to seek to start: %v", err) - } - - layers, err := parseFromFile(context.Background(), file, "", func(api.ProgressResponse) {}) - if err != nil { - t.Fatalf("failed to parse from file: %v", err) - } - - if len(layers) != 1 { - t.Fatalf("got %d != want 1", len(layers)) - } - - if _, err := file.Seek(0, io.SeekStart); err != nil { - t.Fatalf("failed to seek to start: %v", err) - } - - layers2, err := parseFromFile(context.Background(), file, layers[0].Digest, func(api.ProgressResponse) {}) - if err != nil { - t.Fatalf("failed to parse from file: %v", err) - } - if len(layers2) != 1 { - t.Fatalf("got %d != want 1", len(layers2)) - } - - if layers[0].Digest != layers2[0].Digest { - t.Fatalf("got %s != want %s", layers[0].Digest, layers2[0].Digest) - } - - if layers[0].Size != layers2[0].Size { - t.Fatalf("got %d != want %d", layers[0].Size, layers2[0].Size) - } - - if layers[0].MediaType != layers2[0].MediaType { - t.Fatalf("got %v != want %v", layers[0].MediaType, layers2[0].MediaType) - } -} - -func TestParseLayerFromCopy(t *testing.T) { - tempModels := t.TempDir() - - file2, err := os.CreateTemp(tempModels, "") - if err != nil { - t.Fatalf("failed to open file: %v", err) - } - defer file2.Close() - - for range 5 { - if err := llm.WriteGGUF(file2, llm.KV{"general.architecture": "gemma"}, []llm.Tensor{}); err != nil { - t.Fatalf("failed to write gguf: %v", err) - } - 
} - - if _, err := file2.Seek(0, io.SeekStart); err != nil { - t.Fatalf("failed to seek to start: %v", err) - } - - layers, err := parseFromFile(context.Background(), file2, "", func(api.ProgressResponse) {}) - if err != nil { - t.Fatalf("failed to parse from file: %v", err) - } - - if len(layers) != 5 { - t.Fatalf("got %d != want 5", len(layers)) - } -} From 01d544d373d0f7782a9da2a830e0e7fa6926a584 Mon Sep 17 00:00:00 2001 From: royjhan <65097070+royjhan@users.noreply.github.com> Date: Mon, 12 Aug 2024 13:33:34 -0400 Subject: [PATCH 30/71] OpenAI: Simplify input output in testing (#5858) * simplify input output * direct comp * in line image * rm error pointer type * update response testing * lint --- openai/openai_test.go | 668 ++++++++++++++++++++++-------------------- 1 file changed, 344 insertions(+), 324 deletions(-) diff --git a/openai/openai_test.go b/openai/openai_test.go index e08a96c9..c7e9f384 100644 --- a/openai/openai_test.go +++ b/openai/openai_test.go @@ -7,27 +7,22 @@ import ( "io" "net/http" "net/http/httptest" + "reflect" "strings" "testing" "time" "github.com/gin-gonic/gin" - "github.com/stretchr/testify/assert" "github.com/ollama/ollama/api" ) const ( - prefix = `data:image/jpeg;base64,` - image = `iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNk+A8AAQUBAScY42YAAAAASUVORK5CYII=` - imageURL = prefix + image + prefix = `data:image/jpeg;base64,` + image = `iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNk+A8AAQUBAScY42YAAAAASUVORK5CYII=` ) -func prepareRequest(req *http.Request, body any) { - bodyBytes, _ := json.Marshal(body) - req.Body = io.NopCloser(bytes.NewReader(bodyBytes)) - req.Header.Set("Content-Type", "application/json") -} +var False = false func captureRequestMiddleware(capturedRequest any) gin.HandlerFunc { return func(c *gin.Context) { @@ -43,134 +38,136 @@ func captureRequestMiddleware(capturedRequest any) gin.HandlerFunc { func TestChatMiddleware(t *testing.T) { type testCase struct { - Name string - Setup func(t *testing.T, req *http.Request) - Expected func(t *testing.T, req *api.ChatRequest, resp *httptest.ResponseRecorder) + name string + body string + req api.ChatRequest + err ErrorResponse } var capturedRequest *api.ChatRequest testCases := []testCase{ { - Name: "chat handler", - Setup: func(t *testing.T, req *http.Request) { - body := ChatCompletionRequest{ - Model: "test-model", - Messages: []Message{{Role: "user", Content: "Hello"}}, - } - prepareRequest(req, body) - }, - Expected: func(t *testing.T, req *api.ChatRequest, resp *httptest.ResponseRecorder) { - if resp.Code != http.StatusOK { - t.Fatalf("expected 200, got %d", resp.Code) - } - - if req.Messages[0].Role != "user" { - t.Fatalf("expected 'user', got %s", req.Messages[0].Role) - } - - if req.Messages[0].Content != "Hello" { - t.Fatalf("expected 'Hello', got %s", req.Messages[0].Content) - } + name: "chat handler", + body: `{ + "model": "test-model", + "messages": [ + {"role": "user", "content": "Hello"} + ] + }`, + req: api.ChatRequest{ + Model: "test-model", + Messages: []api.Message{ + { + Role: "user", + Content: "Hello", + }, + }, + Options: map[string]any{ + "temperature": 1.0, + "top_p": 1.0, + }, + Stream: &False, }, }, { - Name: "chat handler with image content", - Setup: func(t *testing.T, req *http.Request) { - body := ChatCompletionRequest{ - Model: "test-model", - Messages: []Message{ - { - Role: "user", Content: []map[string]any{ - {"type": "text", "text": "Hello"}, - {"type": "image_url", "image_url": map[string]string{"url": imageURL}}, + name: "chat 
handler with image content", + body: `{ + "model": "test-model", + "messages": [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "Hello" + }, + { + "type": "image_url", + "image_url": { + "url": "` + prefix + image + `" + } + } + ] + } + ] + }`, + req: api.ChatRequest{ + Model: "test-model", + Messages: []api.Message{ + { + Role: "user", + Content: "Hello", + }, + { + Role: "user", + Images: []api.ImageData{ + func() []byte { + img, _ := base64.StdEncoding.DecodeString(image) + return img + }(), + }, + }, + }, + Options: map[string]any{ + "temperature": 1.0, + "top_p": 1.0, + }, + Stream: &False, + }, + }, + { + name: "chat handler with tools", + body: `{ + "model": "test-model", + "messages": [ + {"role": "user", "content": "What's the weather like in Paris Today?"}, + {"role": "assistant", "tool_calls": [{"id": "id", "type": "function", "function": {"name": "get_current_weather", "arguments": "{\"location\": \"Paris, France\", \"format\": \"celsius\"}"}}]} + ] + }`, + req: api.ChatRequest{ + Model: "test-model", + Messages: []api.Message{ + { + Role: "user", + Content: "What's the weather like in Paris Today?", + }, + { + Role: "assistant", + ToolCalls: []api.ToolCall{ + { + Function: api.ToolCallFunction{ + Name: "get_current_weather", + Arguments: map[string]interface{}{ + "location": "Paris, France", + "format": "celsius", + }, + }, }, }, }, - } - prepareRequest(req, body) - }, - Expected: func(t *testing.T, req *api.ChatRequest, resp *httptest.ResponseRecorder) { - if resp.Code != http.StatusOK { - t.Fatalf("expected 200, got %d", resp.Code) - } - - if req.Messages[0].Role != "user" { - t.Fatalf("expected 'user', got %s", req.Messages[0].Role) - } - - if req.Messages[0].Content != "Hello" { - t.Fatalf("expected 'Hello', got %s", req.Messages[0].Content) - } - - img, _ := base64.StdEncoding.DecodeString(imageURL[len(prefix):]) - - if req.Messages[1].Role != "user" { - t.Fatalf("expected 'user', got %s", req.Messages[1].Role) - } - - if !bytes.Equal(req.Messages[1].Images[0], img) { - t.Fatalf("expected image encoding, got %s", req.Messages[1].Images[0]) - } + }, + Options: map[string]any{ + "temperature": 1.0, + "top_p": 1.0, + }, + Stream: &False, }, }, + { - Name: "chat handler with tools", - Setup: func(t *testing.T, req *http.Request) { - body := ChatCompletionRequest{ - Model: "test-model", - Messages: []Message{ - {Role: "user", Content: "What's the weather like in Paris Today?"}, - {Role: "assistant", ToolCalls: []ToolCall{{ - ID: "id", - Type: "function", - Function: struct { - Name string `json:"name"` - Arguments string `json:"arguments"` - }{ - Name: "get_current_weather", - Arguments: "{\"location\": \"Paris, France\", \"format\": \"celsius\"}", - }, - }}}, - }, - } - prepareRequest(req, body) - }, - Expected: func(t *testing.T, req *api.ChatRequest, resp *httptest.ResponseRecorder) { - if resp.Code != 200 { - t.Fatalf("expected 200, got %d", resp.Code) - } - - if req.Messages[0].Content != "What's the weather like in Paris Today?" 
{ - t.Fatalf("expected What's the weather like in Paris Today?, got %s", req.Messages[0].Content) - } - - if req.Messages[1].ToolCalls[0].Function.Arguments["location"] != "Paris, France" { - t.Fatalf("expected 'Paris, France', got %v", req.Messages[1].ToolCalls[0].Function.Arguments["location"]) - } - - if req.Messages[1].ToolCalls[0].Function.Arguments["format"] != "celsius" { - t.Fatalf("expected celsius, got %v", req.Messages[1].ToolCalls[0].Function.Arguments["format"]) - } - }, - }, - { - Name: "chat handler error forwarding", - Setup: func(t *testing.T, req *http.Request) { - body := ChatCompletionRequest{ - Model: "test-model", - Messages: []Message{{Role: "user", Content: 2}}, - } - prepareRequest(req, body) - }, - Expected: func(t *testing.T, req *api.ChatRequest, resp *httptest.ResponseRecorder) { - if resp.Code != http.StatusBadRequest { - t.Fatalf("expected 400, got %d", resp.Code) - } - - if !strings.Contains(resp.Body.String(), "invalid message content type") { - t.Fatalf("error was not forwarded") - } + name: "chat handler error forwarding", + body: `{ + "model": "test-model", + "messages": [ + {"role": "user", "content": 2} + ] + }`, + err: ErrorResponse{ + Error: Error{ + Message: "invalid message content type: float64", + Type: "invalid_request_error", + }, }, }, } @@ -185,16 +182,26 @@ func TestChatMiddleware(t *testing.T) { router.Handle(http.MethodPost, "/api/chat", endpoint) for _, tc := range testCases { - t.Run(tc.Name, func(t *testing.T) { - req, _ := http.NewRequest(http.MethodPost, "/api/chat", nil) - - tc.Setup(t, req) + t.Run(tc.name, func(t *testing.T) { + req, _ := http.NewRequest(http.MethodPost, "/api/chat", strings.NewReader(tc.body)) + req.Header.Set("Content-Type", "application/json") resp := httptest.NewRecorder() router.ServeHTTP(resp, req) - tc.Expected(t, capturedRequest, resp) + var errResp ErrorResponse + if resp.Code != http.StatusOK { + if err := json.Unmarshal(resp.Body.Bytes(), &errResp); err != nil { + t.Fatal(err) + } + } + if capturedRequest != nil && !reflect.DeepEqual(tc.req, *capturedRequest) { + t.Fatal("requests did not match") + } + if !reflect.DeepEqual(tc.err, errResp) { + t.Fatal("errors did not match") + } capturedRequest = nil }) } @@ -202,71 +209,52 @@ func TestChatMiddleware(t *testing.T) { func TestCompletionsMiddleware(t *testing.T) { type testCase struct { - Name string - Setup func(t *testing.T, req *http.Request) - Expected func(t *testing.T, req *api.GenerateRequest, resp *httptest.ResponseRecorder) + name string + body string + req api.GenerateRequest + err ErrorResponse } var capturedRequest *api.GenerateRequest testCases := []testCase{ { - Name: "completions handler", - Setup: func(t *testing.T, req *http.Request) { - temp := float32(0.8) - body := CompletionRequest{ - Model: "test-model", - Prompt: "Hello", - Temperature: &temp, - Stop: []string{"\n", "stop"}, - Suffix: "suffix", - } - prepareRequest(req, body) - }, - Expected: func(t *testing.T, req *api.GenerateRequest, resp *httptest.ResponseRecorder) { - if req.Prompt != "Hello" { - t.Fatalf("expected 'Hello', got %s", req.Prompt) - } - - if req.Options["temperature"] != 1.6 { - t.Fatalf("expected 1.6, got %f", req.Options["temperature"]) - } - - stopTokens, ok := req.Options["stop"].([]any) - - if !ok { - t.Fatalf("expected stop tokens to be a list") - } - - if stopTokens[0] != "\n" || stopTokens[1] != "stop" { - t.Fatalf("expected ['\\n', 'stop'], got %v", stopTokens) - } - - if req.Suffix != "suffix" { - t.Fatalf("expected 'suffix', got %s", req.Suffix) - } + 
name: "completions handler", + body: `{ + "model": "test-model", + "prompt": "Hello", + "temperature": 0.8, + "stop": ["\n", "stop"], + "suffix": "suffix" + }`, + req: api.GenerateRequest{ + Model: "test-model", + Prompt: "Hello", + Options: map[string]any{ + "frequency_penalty": 0.0, + "presence_penalty": 0.0, + "temperature": 1.6, + "top_p": 1.0, + "stop": []any{"\n", "stop"}, + }, + Suffix: "suffix", + Stream: &False, }, }, { - Name: "completions handler error forwarding", - Setup: func(t *testing.T, req *http.Request) { - body := CompletionRequest{ - Model: "test-model", - Prompt: "Hello", - Temperature: nil, - Stop: []int{1, 2}, - Suffix: "suffix", - } - prepareRequest(req, body) - }, - Expected: func(t *testing.T, req *api.GenerateRequest, resp *httptest.ResponseRecorder) { - if resp.Code != http.StatusBadRequest { - t.Fatalf("expected 400, got %d", resp.Code) - } - - if !strings.Contains(resp.Body.String(), "invalid type for 'stop' field") { - t.Fatalf("error was not forwarded") - } + name: "completions handler error forwarding", + body: `{ + "model": "test-model", + "prompt": "Hello", + "temperature": null, + "stop": [1, 2], + "suffix": "suffix" + }`, + err: ErrorResponse{ + Error: Error{ + Message: "invalid type for 'stop' field: float64", + Type: "invalid_request_error", + }, }, }, } @@ -281,15 +269,27 @@ func TestCompletionsMiddleware(t *testing.T) { router.Handle(http.MethodPost, "/api/generate", endpoint) for _, tc := range testCases { - t.Run(tc.Name, func(t *testing.T) { - req, _ := http.NewRequest(http.MethodPost, "/api/generate", nil) - - tc.Setup(t, req) + t.Run(tc.name, func(t *testing.T) { + req, _ := http.NewRequest(http.MethodPost, "/api/generate", strings.NewReader(tc.body)) + req.Header.Set("Content-Type", "application/json") resp := httptest.NewRecorder() router.ServeHTTP(resp, req) - tc.Expected(t, capturedRequest, resp) + var errResp ErrorResponse + if resp.Code != http.StatusOK { + if err := json.Unmarshal(resp.Body.Bytes(), &errResp); err != nil { + t.Fatal(err) + } + } + + if capturedRequest != nil && !reflect.DeepEqual(tc.req, *capturedRequest) { + t.Fatal("requests did not match") + } + + if !reflect.DeepEqual(tc.err, errResp) { + t.Fatal("errors did not match") + } capturedRequest = nil }) @@ -298,78 +298,47 @@ func TestCompletionsMiddleware(t *testing.T) { func TestEmbeddingsMiddleware(t *testing.T) { type testCase struct { - Name string - Setup func(t *testing.T, req *http.Request) - Expected func(t *testing.T, req *api.EmbedRequest, resp *httptest.ResponseRecorder) + name string + body string + req api.EmbedRequest + err ErrorResponse } var capturedRequest *api.EmbedRequest testCases := []testCase{ { - Name: "embed handler single input", - Setup: func(t *testing.T, req *http.Request) { - body := EmbedRequest{ - Input: "Hello", - Model: "test-model", - } - prepareRequest(req, body) - }, - Expected: func(t *testing.T, req *api.EmbedRequest, resp *httptest.ResponseRecorder) { - if req.Input != "Hello" { - t.Fatalf("expected 'Hello', got %s", req.Input) - } - - if req.Model != "test-model" { - t.Fatalf("expected 'test-model', got %s", req.Model) - } + name: "embed handler single input", + body: `{ + "input": "Hello", + "model": "test-model" + }`, + req: api.EmbedRequest{ + Input: "Hello", + Model: "test-model", }, }, { - Name: "embed handler batch input", - Setup: func(t *testing.T, req *http.Request) { - body := EmbedRequest{ - Input: []string{"Hello", "World"}, - Model: "test-model", - } - prepareRequest(req, body) - }, - Expected: func(t *testing.T, req 
*api.EmbedRequest, resp *httptest.ResponseRecorder) { - input, ok := req.Input.([]any) - - if !ok { - t.Fatalf("expected input to be a list") - } - - if input[0].(string) != "Hello" { - t.Fatalf("expected 'Hello', got %s", input[0]) - } - - if input[1].(string) != "World" { - t.Fatalf("expected 'World', got %s", input[1]) - } - - if req.Model != "test-model" { - t.Fatalf("expected 'test-model', got %s", req.Model) - } + name: "embed handler batch input", + body: `{ + "input": ["Hello", "World"], + "model": "test-model" + }`, + req: api.EmbedRequest{ + Input: []any{"Hello", "World"}, + Model: "test-model", }, }, { - Name: "embed handler error forwarding", - Setup: func(t *testing.T, req *http.Request) { - body := EmbedRequest{ - Model: "test-model", - } - prepareRequest(req, body) - }, - Expected: func(t *testing.T, req *api.EmbedRequest, resp *httptest.ResponseRecorder) { - if resp.Code != http.StatusBadRequest { - t.Fatalf("expected 400, got %d", resp.Code) - } - - if !strings.Contains(resp.Body.String(), "invalid input") { - t.Fatalf("error was not forwarded") - } + name: "embed handler error forwarding", + body: `{ + "model": "test-model" + }`, + err: ErrorResponse{ + Error: Error{ + Message: "invalid input", + Type: "invalid_request_error", + }, }, }, } @@ -384,116 +353,167 @@ func TestEmbeddingsMiddleware(t *testing.T) { router.Handle(http.MethodPost, "/api/embed", endpoint) for _, tc := range testCases { - t.Run(tc.Name, func(t *testing.T) { - req, _ := http.NewRequest(http.MethodPost, "/api/embed", nil) - - tc.Setup(t, req) + t.Run(tc.name, func(t *testing.T) { + req, _ := http.NewRequest(http.MethodPost, "/api/embed", strings.NewReader(tc.body)) + req.Header.Set("Content-Type", "application/json") resp := httptest.NewRecorder() router.ServeHTTP(resp, req) - tc.Expected(t, capturedRequest, resp) + var errResp ErrorResponse + if resp.Code != http.StatusOK { + if err := json.Unmarshal(resp.Body.Bytes(), &errResp); err != nil { + t.Fatal(err) + } + } + + if capturedRequest != nil && !reflect.DeepEqual(tc.req, *capturedRequest) { + t.Fatal("requests did not match") + } + + if !reflect.DeepEqual(tc.err, errResp) { + t.Fatal("errors did not match") + } capturedRequest = nil }) } } -func TestMiddlewareResponses(t *testing.T) { +func TestListMiddleware(t *testing.T) { type testCase struct { - Name string - Method string - Path string - TestPath string - Handler func() gin.HandlerFunc - Endpoint func(c *gin.Context) - Setup func(t *testing.T, req *http.Request) - Expected func(t *testing.T, resp *httptest.ResponseRecorder) + name string + endpoint func(c *gin.Context) + resp string } testCases := []testCase{ { - Name: "list handler", - Method: http.MethodGet, - Path: "/api/tags", - TestPath: "/api/tags", - Handler: ListMiddleware, - Endpoint: func(c *gin.Context) { + name: "list handler", + endpoint: func(c *gin.Context) { c.JSON(http.StatusOK, api.ListResponse{ Models: []api.ListModelResponse{ { - Name: "Test Model", + Name: "test-model", + ModifiedAt: time.Unix(int64(1686935002), 0).UTC(), }, }, }) }, - Expected: func(t *testing.T, resp *httptest.ResponseRecorder) { - var listResp ListCompletion - if err := json.NewDecoder(resp.Body).Decode(&listResp); err != nil { - t.Fatal(err) - } - - if listResp.Object != "list" { - t.Fatalf("expected list, got %s", listResp.Object) - } - - if len(listResp.Data) != 1 { - t.Fatalf("expected 1, got %d", len(listResp.Data)) - } - - if listResp.Data[0].Id != "Test Model" { - t.Fatalf("expected Test Model, got %s", listResp.Data[0].Id) - } - }, + resp: `{ + 
"object": "list", + "data": [ + { + "id": "test-model", + "object": "model", + "created": 1686935002, + "owned_by": "library" + } + ] + }`, }, { - Name: "retrieve model", - Method: http.MethodGet, - Path: "/api/show/:model", - TestPath: "/api/show/test-model", - Handler: RetrieveMiddleware, - Endpoint: func(c *gin.Context) { - c.JSON(http.StatusOK, api.ShowResponse{ - ModifiedAt: time.Date(2024, 6, 17, 13, 45, 0, 0, time.UTC), - }) - }, - Expected: func(t *testing.T, resp *httptest.ResponseRecorder) { - var retrieveResp Model - if err := json.NewDecoder(resp.Body).Decode(&retrieveResp); err != nil { - t.Fatal(err) - } - - if retrieveResp.Object != "model" { - t.Fatalf("Expected object to be model, got %s", retrieveResp.Object) - } - - if retrieveResp.Id != "test-model" { - t.Fatalf("Expected id to be test-model, got %s", retrieveResp.Id) - } + name: "list handler empty output", + endpoint: func(c *gin.Context) { + c.JSON(http.StatusOK, api.ListResponse{}) }, + resp: `{ + "object": "list", + "data": null + }`, }, } gin.SetMode(gin.TestMode) - router := gin.New() for _, tc := range testCases { - t.Run(tc.Name, func(t *testing.T) { - router = gin.New() - router.Use(tc.Handler()) - router.Handle(tc.Method, tc.Path, tc.Endpoint) - req, _ := http.NewRequest(tc.Method, tc.TestPath, nil) + router := gin.New() + router.Use(ListMiddleware()) + router.Handle(http.MethodGet, "/api/tags", tc.endpoint) + req, _ := http.NewRequest(http.MethodGet, "/api/tags", nil) - if tc.Setup != nil { - tc.Setup(t, req) - } + resp := httptest.NewRecorder() + router.ServeHTTP(resp, req) - resp := httptest.NewRecorder() - router.ServeHTTP(resp, req) + var expected, actual map[string]any + err := json.Unmarshal([]byte(tc.resp), &expected) + if err != nil { + t.Fatalf("failed to unmarshal expected response: %v", err) + } - assert.Equal(t, http.StatusOK, resp.Code) + err = json.Unmarshal(resp.Body.Bytes(), &actual) + if err != nil { + t.Fatalf("failed to unmarshal actual response: %v", err) + } - tc.Expected(t, resp) - }) + if !reflect.DeepEqual(expected, actual) { + t.Errorf("responses did not match\nExpected: %+v\nActual: %+v", expected, actual) + } + } +} + +func TestRetrieveMiddleware(t *testing.T) { + type testCase struct { + name string + endpoint func(c *gin.Context) + resp string + } + + testCases := []testCase{ + { + name: "retrieve handler", + endpoint: func(c *gin.Context) { + c.JSON(http.StatusOK, api.ShowResponse{ + ModifiedAt: time.Unix(int64(1686935002), 0).UTC(), + }) + }, + resp: `{ + "id":"test-model", + "object":"model", + "created":1686935002, + "owned_by":"library"} + `, + }, + { + name: "retrieve handler error forwarding", + endpoint: func(c *gin.Context) { + c.JSON(http.StatusBadRequest, gin.H{"error": "model not found"}) + }, + resp: `{ + "error": { + "code": null, + "message": "model not found", + "param": null, + "type": "api_error" + } + }`, + }, + } + + gin.SetMode(gin.TestMode) + + for _, tc := range testCases { + router := gin.New() + router.Use(RetrieveMiddleware()) + router.Handle(http.MethodGet, "/api/show/:model", tc.endpoint) + req, _ := http.NewRequest(http.MethodGet, "/api/show/test-model", nil) + + resp := httptest.NewRecorder() + router.ServeHTTP(resp, req) + + var expected, actual map[string]any + err := json.Unmarshal([]byte(tc.resp), &expected) + if err != nil { + t.Fatalf("failed to unmarshal expected response: %v", err) + } + + err = json.Unmarshal(resp.Body.Bytes(), &actual) + if err != nil { + t.Fatalf("failed to unmarshal actual response: %v", err) + } + + if 
!reflect.DeepEqual(expected, actual) { + t.Errorf("responses did not match\nExpected: %+v\nActual: %+v", expected, actual) + } } } From 980dd15f81e9021c5165a1e516748d42cf134339 Mon Sep 17 00:00:00 2001 From: Josh <76125168+joshyan1@users.noreply.github.com> Date: Mon, 12 Aug 2024 11:46:09 -0700 Subject: [PATCH 31/71] cmd: speed up gguf creates (#6324) --- server/model.go | 17 +++++++-- server/model_test.go | 82 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 96 insertions(+), 3 deletions(-) diff --git a/server/model.go b/server/model.go index ad6e4e55..b17bf0e3 100644 --- a/server/model.go +++ b/server/model.go @@ -176,9 +176,20 @@ func parseFromFile(ctx context.Context, file *os.File, digest string, fn func(ap mediatype = "application/vnd.ollama.image.projector" } - layer, err := NewLayer(io.NewSectionReader(file, offset, n), mediatype) - if err != nil { - return nil, err + var layer Layer + if digest != "" && n == stat.Size() && offset == 0 { + layer, err = NewLayerFromLayer(digest, mediatype, file.Name()) + if err != nil { + slog.Debug("could not create new layer from layer", "error", err) + } + } + + // Fallback to creating layer from file copy (either NewLayerFromLayer failed, or digest empty/n != stat.Size()) + if layer.Digest == "" { + layer, err = NewLayer(io.NewSectionReader(file, offset, n), mediatype) + if err != nil { + return nil, err + } } layers = append(layers, &layerGGML{layer, ggml}) diff --git a/server/model_test.go b/server/model_test.go index aa214d3d..63fc408d 100644 --- a/server/model_test.go +++ b/server/model_test.go @@ -2,8 +2,10 @@ package server import ( "bytes" + "context" "encoding/json" "fmt" + "io" "os" "path/filepath" "testing" @@ -11,6 +13,7 @@ import ( "github.com/google/go-cmp/cmp" "github.com/ollama/ollama/api" + "github.com/ollama/ollama/llm" "github.com/ollama/ollama/template" ) @@ -133,3 +136,82 @@ The temperature in San Francisco, CA is 70°F and in Toronto, Canada is 20°C.`, }) } } + +func TestParseFromFileFromLayer(t *testing.T) { + tempModels := t.TempDir() + + file, err := os.CreateTemp(tempModels, "") + if err != nil { + t.Fatalf("failed to open file: %v", err) + } + defer file.Close() + if err := llm.WriteGGUF(file, llm.KV{"general.architecture": "gemma"}, []llm.Tensor{}); err != nil { + t.Fatalf("failed to write gguf: %v", err) + } + + if _, err := file.Seek(0, io.SeekStart); err != nil { + t.Fatalf("failed to seek to start: %v", err) + } + + layers, err := parseFromFile(context.Background(), file, "", func(api.ProgressResponse) {}) + if err != nil { + t.Fatalf("failed to parse from file: %v", err) + } + + if len(layers) != 1 { + t.Fatalf("got %d != want 1", len(layers)) + } + + if _, err := file.Seek(0, io.SeekStart); err != nil { + t.Fatalf("failed to seek to start: %v", err) + } + + layers2, err := parseFromFile(context.Background(), file, layers[0].Digest, func(api.ProgressResponse) {}) + if err != nil { + t.Fatalf("failed to parse from file: %v", err) + } + if len(layers2) != 1 { + t.Fatalf("got %d != want 1", len(layers2)) + } + + if layers[0].Digest != layers2[0].Digest { + t.Fatalf("got %s != want %s", layers[0].Digest, layers2[0].Digest) + } + + if layers[0].Size != layers2[0].Size { + t.Fatalf("got %d != want %d", layers[0].Size, layers2[0].Size) + } + + if layers[0].MediaType != layers2[0].MediaType { + t.Fatalf("got %v != want %v", layers[0].MediaType, layers2[0].MediaType) + } +} + +func TestParseLayerFromCopy(t *testing.T) { + tempModels := t.TempDir() + + file2, err := os.CreateTemp(tempModels, "") + if err != nil 
{ + t.Fatalf("failed to open file: %v", err) + } + defer file2.Close() + + for range 5 { + if err := llm.WriteGGUF(file2, llm.KV{"general.architecture": "gemma"}, []llm.Tensor{}); err != nil { + t.Fatalf("failed to write gguf: %v", err) + } + } + + if _, err := file2.Seek(0, io.SeekStart); err != nil { + t.Fatalf("failed to seek to start: %v", err) + } + + layers, err := parseFromFile(context.Background(), file2, "", func(api.ProgressResponse) {}) + if err != nil { + t.Fatalf("failed to parse from file: %v", err) + } + + if len(layers) != 5 { + t.Fatalf("got %d != want 5", len(layers)) + } +} From f7e3b9190f7e8f99bac8af432b9539e24cd3b57e Mon Sep 17 00:00:00 2001 From: Josh <76125168+joshyan1@users.noreply.github.com> Date: Mon, 12 Aug 2024 11:46:32 -0700 Subject: [PATCH 32/71] cmd: spinner progress for transfer model data (#6100) --- cmd/cmd.go | 45 ++++++++++++++++++++++++++++++++++++++++++--- progress/spinner.go | 14 ++++++++++---- 2 files changed, 52 insertions(+), 7 deletions(-) diff --git a/cmd/cmd.go b/cmd/cmd.go index d47db65b..2356110e 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -22,6 +22,7 @@ import ( "runtime" "slices" "strings" + "sync/atomic" "syscall" "time" @@ -78,6 +79,7 @@ func CreateHandler(cmd *cobra.Command, args []string) error { status := "transferring model data" spinner := progress.NewSpinner(status) p.Add(status, spinner) + defer p.Stop() for i := range modelfile.Commands { switch modelfile.Commands[i].Name { @@ -112,7 +114,7 @@ func CreateHandler(cmd *cobra.Command, args []string) error { path = tempfile } - digest, err := createBlob(cmd, client, path) + digest, err := createBlob(cmd, client, path, spinner) if err != nil { return err } @@ -263,13 +265,20 @@ func tempZipFiles(path string) (string, error) { return tempfile.Name(), nil } -func createBlob(cmd *cobra.Command, client *api.Client, path string) (string, error) { +func createBlob(cmd *cobra.Command, client *api.Client, path string, spinner *progress.Spinner) (string, error) { bin, err := os.Open(path) if err != nil { return "", err } defer bin.Close() + // Get file info to retrieve the size + fileInfo, err := bin.Stat() + if err != nil { + return "", err + } + fileSize := fileInfo.Size() + hash := sha256.New() if _, err := io.Copy(hash, bin); err != nil { return "", err @@ -279,13 +288,43 @@ func createBlob(cmd *cobra.Command, client *api.Client, path string) (string, er return "", err } + var pw progressWriter + status := "transferring model data 0%" + spinner.SetMessage(status) + + done := make(chan struct{}) + defer close(done) + + go func() { + ticker := time.NewTicker(60 * time.Millisecond) + defer ticker.Stop() + for { + select { + case <-ticker.C: + spinner.SetMessage(fmt.Sprintf("transferring model data %d%%", int(100*pw.n.Load()/fileSize))) + case <-done: + spinner.SetMessage("transferring model data 100%") + return + } + } + }() + digest := fmt.Sprintf("sha256:%x", hash.Sum(nil)) - if err = client.CreateBlob(cmd.Context(), digest, bin); err != nil { + if err = client.CreateBlob(cmd.Context(), digest, io.TeeReader(bin, &pw)); err != nil { return "", err } return digest, nil } +type progressWriter struct { + n atomic.Int64 +} + +func (w *progressWriter) Write(p []byte) (n int, err error) { + w.n.Add(int64(len(p))) + return len(p), nil +} + func RunHandler(cmd *cobra.Command, args []string) error { interactive := true diff --git a/progress/spinner.go b/progress/spinner.go index 02f3f9fb..e39a45ee 100644 --- a/progress/spinner.go +++ b/progress/spinner.go @@ -3,11 +3,12 @@ package progress import ( 
"fmt" "strings" + "sync/atomic" "time" ) type Spinner struct { - message string + message atomic.Value messageWidth int parts []string @@ -21,20 +22,25 @@ type Spinner struct { func NewSpinner(message string) *Spinner { s := &Spinner{ - message: message, parts: []string{ "⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏", }, started: time.Now(), } + s.SetMessage(message) go s.start() return s } +func (s *Spinner) SetMessage(message string) { + s.message.Store(message) +} + func (s *Spinner) String() string { var sb strings.Builder - if len(s.message) > 0 { - message := strings.TrimSpace(s.message) + + if message, ok := s.message.Load().(string); ok && len(message) > 0 { + message := strings.TrimSpace(message) if s.messageWidth > 0 && len(message) > s.messageWidth { message = message[:s.messageWidth] } From 6ffb5cb017a1c81970ac637907a8ba6fd151e0e7 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Mon, 3 Jun 2024 15:53:58 -0700 Subject: [PATCH 33/71] add conversion for microsoft phi 3 mini/medium 4k, 128 --- convert/convert.go | 6 + convert/convert_llama.go | 4 - convert/convert_phi3.go | 125 ++++++++++ convert/convert_test.go | 2 + .../testdata/Phi-3-mini-128k-instruct.json | 225 ++++++++++++++++++ llm/ggml.go | 8 + llm/gguf.go | 15 +- 7 files changed, 373 insertions(+), 12 deletions(-) create mode 100644 convert/convert_phi3.go create mode 100644 convert/testdata/Phi-3-mini-128k-instruct.json diff --git a/convert/convert.go b/convert/convert.go index b9461e4f..24c19aa4 100644 --- a/convert/convert.go +++ b/convert/convert.go @@ -27,6 +27,10 @@ func (Parameters) KV(t *Tokenizer) llm.KV { "tokenizer.ggml.token_type": t.Vocabulary.Types, } + if len(t.Merges) > 0 { + kv["tokenizer.ggml.merges"] = t.Merges + } + if t.Template != "" { kv["tokenizer.chat_template"] = t.Template } @@ -89,6 +93,8 @@ func Convert(fsys fs.FS, ws io.WriteSeeker) error { conv = &mixtral{} case "GemmaForCausalLM": conv = &gemma{} + case "Phi3ForCausalLM": + conv = &phi3{} default: return errors.New("unsupported architecture") } diff --git a/convert/convert_llama.go b/convert/convert_llama.go index 0383a85e..178b13f3 100644 --- a/convert/convert_llama.go +++ b/convert/convert_llama.go @@ -90,10 +90,6 @@ func (p *llama) KV(t *Tokenizer) llm.KV { kv["llama.attention.value_length"] = p.HeadDim } - if len(t.Merges) > 0 { - kv["tokenizer.ggml.merges"] = t.Merges - } - return kv } diff --git a/convert/convert_phi3.go b/convert/convert_phi3.go new file mode 100644 index 00000000..7aa3ed15 --- /dev/null +++ b/convert/convert_phi3.go @@ -0,0 +1,125 @@ +package convert + +import ( + "cmp" + "encoding/binary" + "io" + "math" + "strings" + "sync" + + "github.com/ollama/ollama/llm" +) + +type phi3 struct { + Parameters + NumHiddenLayers uint32 `json:"num_hidden_layers"` + NLayers uint32 `json:"n_layers"` + HiddenSize uint32 `json:"hidden_size"` + NEmbd uint32 `json:"n_embd"` + IntermediateSize uint32 `json:"intermediate_size"` + NumAttentionHeads uint32 `json:"num_attention_heads"` + NHead uint32 `json:"n_head"` + NumKeyValueHeads uint32 `json:"num_key_value_heads"` + NHeadKV uint32 `json:"n_head_kv"` + RopeTheta float32 `json:"rope_theta"` + RopeScaling struct { + Type string `json:"type"` + LongFactor ropeFactor `json:"long_factor"` + ShortFactor ropeFactor `json:"short_factor"` + } `json:"rope_scaling"` + RMSNormEPS float32 `json:"rms_norm_eps"` + NPositions uint32 `json:"n_positions"` + MaxPositionEmbeddings uint32 `json:"max_position_embeddings"` + OriginalMaxPositionEmbeddings uint32 `json:"original_max_position_embeddings"` + 
SlidingWindow uint32 `json:"sliding_window"` +} + +var _ Converter = (*phi3)(nil) + +func (p *phi3) KV(t *Tokenizer) llm.KV { + kv := p.Parameters.KV(t) + kv["general.architecture"] = "phi3" + kv["general.name"] = "phi3" + kv["phi3.context_length"] = p.MaxPositionEmbeddings + kv["phi3.embedding_length"] = cmp.Or(p.HiddenSize, p.NEmbd) + kv["phi3.feed_forward_length"] = p.IntermediateSize + kv["phi3.block_count"] = cmp.Or(p.NumHiddenLayers, p.NLayers) + kv["phi3.attention.head_count"] = cmp.Or(p.NumAttentionHeads, p.NHead) + kv["phi3.attention.head_count_kv"] = cmp.Or(p.NumKeyValueHeads, p.NHeadKV) + kv["phi3.attention.layer_norm_rms_epsilon"] = p.RMSNormEPS + kv["phi3.rope.dimension_count"] = p.HiddenSize / cmp.Or(p.NumAttentionHeads, p.NHead) + kv["phi3.rope.freq_base"] = p.RopeTheta + kv["phi3.rope.scaling.original_context_length"] = p.OriginalMaxPositionEmbeddings + kv["phi3.attention.sliding_window"] = p.SlidingWindow + + scale := float64(p.MaxPositionEmbeddings) / float64(p.OriginalMaxPositionEmbeddings) + + switch p.RopeScaling.Type { + case "": + // no scaling + case "su": + kv["phi3.rope.scaling.attn_factor"] = float32(max(math.Sqrt(1+math.Log(scale)/math.Log(float64(p.OriginalMaxPositionEmbeddings))), 1.0)) + case "yarn": + kv["phi3.rope.scaling.attn_factor"] = float32(max(0.1*math.Log(scale)+1.0, 1.0)) + default: + panic("unknown rope scaling type") + } + + return kv +} + +func (p *phi3) Tensors(ts []Tensor) []llm.Tensor { + var addRopeFactors sync.Once + + out := make([]llm.Tensor, 0, len(ts)+2) + for _, t := range ts { + name := p.tensorName(t.Name()) + if strings.HasPrefix(name, "blk.0.") { + addRopeFactors.Do(func() { + out = append(out, llm.Tensor{ + Name: "rope_factors_long.weight", + Kind: 0, + Shape: []uint64{uint64(len(p.RopeScaling.LongFactor))}, + WriterTo: p.RopeScaling.LongFactor, + }, llm.Tensor{ + Name: "rope_factors_short.weight", + Kind: 0, + Shape: []uint64{uint64(len(p.RopeScaling.ShortFactor))}, + WriterTo: p.RopeScaling.ShortFactor, + }) + }) + } + + out = append(out, llm.Tensor{ + Name: name, + Kind: t.Kind(), + Shape: t.Shape(), + WriterTo: t, + }) + } + + return out +} + +func (p *phi3) tensorName(n string) string { + return strings.NewReplacer( + "lm_head", "output", + "model.embed_tokens", "token_embd", + "model.norm", "output_norm", + "model.layers", "blk", + "input_layernorm", "attn_norm", + "self_attn.qkv_proj", "attn_qkv", + "self_attn.o_proj", "attn_output", + "mlp.down_proj", "ffn_down", + "mlp.gate_up_proj", "ffn_up", + "post_attention_layernorm", "ffn_norm", + ).Replace(n) +} + +type ropeFactor []float32 + +func (r ropeFactor) WriteTo(w io.Writer) (int64, error) { + err := binary.Write(w, binary.LittleEndian, r) + return 0, err +} diff --git a/convert/convert_test.go b/convert/convert_test.go index 88f38494..cb2c585e 100644 --- a/convert/convert_test.go +++ b/convert/convert_test.go @@ -65,6 +65,8 @@ func TestConvertFull(t *testing.T) { "Mistral-7B-Instruct-v0.2", "Mixtral-8x7B-Instruct-v0.1", "gemma-2b-it", + // microsoft/Phi-3-mini-128-instruct@d548c233192db00165d842bf8edff054bb3212f8 + "Phi-3-mini-128k-instruct", } for i := range cases { diff --git a/convert/testdata/Phi-3-mini-128k-instruct.json b/convert/testdata/Phi-3-mini-128k-instruct.json new file mode 100644 index 00000000..19296f5a --- /dev/null +++ b/convert/testdata/Phi-3-mini-128k-instruct.json @@ -0,0 +1,225 @@ +{ + "general.architecture": "phi3", + "general.file_type": "1", + "general.quantization_version": "2", + "phi3.block_count": "32", + "phi3.context_length": "131072", + 
"phi3.embedding_length": "3072", + "phi3.feed_forward_length": "8192", + "phi3.rope.scaling.original_context_length": "4096", + "phi3.rope.dimension_count": "96", + "phi3.rope.freq_base": "10000", + "phi3.rope.scaling.attn_factor": "1.1902381", + "phi3.attention.head_count": "32", + "phi3.attention.head_count_kv": "32", + "phi3.attention.layer_norm_rms_epsilon": "1e-05", + "phi3.attention.sliding_window": "262144", + "tokenizer.ggml.model": "llama", + "tokenizer.ggml.pre": "default", + "tokenizer.ggml.add_bos_token": "false", + "tokenizer.ggml.add_eos_token": "false", + "tokenizer.ggml.bos_token_id": "1", + "tokenizer.ggml.eos_token_id": "32000", + "tokenizer.ggml.unknown_token_id": "0", + "tokenizer.ggml.padding_token_id": "32000", + "tokenizer.ggml.scores": "6e37bcde2adc7e350e87c496eddd7a2124329c1dc66c5bf3ad3997253e4f7a62", + "tokenizer.ggml.token_type": "b6ecf55ec64ee67d87750bdb8d757a2c58bf78377e9f4219f5689a6c4dea57ce", + "tokenizer.ggml.tokens": "d168da3ddd3eee820916945fcb9baf24dd3cde42f606cffa2d19e7c8a8743918", + "blk.0.attn_norm.weight": "216aeb2c9e0c271f899e1ef2a63cceeb8f41e97642e84fada54b1d3c1c11cf25", + "blk.0.attn_output.weight": "b597d56f7188ffc1fafc273fadc59d41738cffd677ae98c61a62c3285b3a3099", + "blk.0.attn_qkv.weight": "d28a6b44e13f59be5483e4be2bedb544e346168d720aca27f47d1a5a722be91e", + "blk.0.ffn_down.weight": "4a691370e5a61fcbbf540fbcbf4c0f1d15dec0364528c0e916d0744f6262b63b", + "blk.0.ffn_norm.weight": "0c00af2b4a3128bec64a0cbb1084b042fdbe13d9ad0d03bd577f9449dfead338", + "blk.0.ffn_up.weight": "b32b52f790c1c083bfb8a3126dc1111cfeeb28dc8c584a930a1e5334cb176bf4", + "blk.1.attn_norm.weight": "68748011503c6c029e8e69a84a8e5a89338f378769627b6dbf7f93d715c292e1", + "blk.1.attn_output.weight": "2267344add13b048ca59e4377c86dc512be8046a57156901fa32a20fa74e4ee0", + "blk.1.attn_qkv.weight": "9109d2e3d7a2eacfda5226587b8be124a3bf44b972da7ebb17aa15795897eacc", + "blk.1.ffn_down.weight": "d675df4df4dd039c0c339ad6445d39eddd2004db6bf35bed6314c7497245a633", + "blk.1.ffn_norm.weight": "3b5767ae977bc8baaa06b06efdbea193b6b3ba605ce76d77a76ce317e935500c", + "blk.1.ffn_up.weight": "80dfd6d9d234b00334c89b8e0a02f81899c2efd377321c34ba5ba51a5f61b5ff", + "blk.2.attn_norm.weight": "6a6743b057e5088f145bc179e92c9bfb41163e7295d7b81c62e23dd89d2b59c4", + "blk.2.attn_output.weight": "bc5491ea54e0db81462d7d9b7d25cbdda380c2db8de041bd1c4ab7b76a1d19c3", + "blk.2.attn_qkv.weight": "a61287a9852e2f5aca9c100b471d98398b2913a3497c743de3c70ec9ddd7087f", + "blk.2.ffn_down.weight": "4fddcc382c8dceeab027fe43d8d44e67edb5e8ce4b9a1b7f773c87770380ade1", + "blk.2.ffn_norm.weight": "07e05f82b3f63f711db3b684ca79aed25c0657917e66f88af47348a82065c227", + "blk.2.ffn_up.weight": "4835a682ef1826c12df01ae7663fc45f9c82bc8e64b665f13fb7da8e201ec0fb", + "blk.3.attn_norm.weight": "f22aba7c03999ba7136f39cda747a39715e498699dc1716cd97fc5dfc58d1b1c", + "blk.3.attn_output.weight": "53b579855366fd786c5126b2b30aac4d583ca7bda56833c4865f5cadb5c18c6d", + "blk.3.attn_qkv.weight": "bb56aba78158123140fcea59c69ac562ca208f6d3086819417cdad8c50f333ad", + "blk.3.ffn_down.weight": "97280897a7cd86db2830c004bccc5bc094f50e293baded0189159a2019145a6e", + "blk.3.ffn_norm.weight": "10a8c99f8b57a960e8e0a1133c4a26f9148403d1b9bff2eff114917de996f3b5", + "blk.3.ffn_up.weight": "7324046c915e75d621b2043597a245a428d8eea31869135e6257a861491d8dcc", + "blk.4.attn_norm.weight": "507d8e164de94646edbfe33def8e8fbf7c9a6ee3fbaedb5000f72d9f51ec5e36", + "blk.4.attn_output.weight": "bbb3429e6efa98c150e0fdbf48c16180cbf0d0cbc1b3c253c6c319d78f4593a2", + "blk.4.attn_qkv.weight": 
"b95ee5be0786d3901273d806c339fe6c20e6bfffd2a20672a9f56af80921e8ab", + "blk.4.ffn_down.weight": "806bbf91df92a5a22bd5aa1ffb7fc2869f7293ffc7704771c290ecc583b27975", + "blk.4.ffn_norm.weight": "cfc2930a81df7aee3a5e7f726a15c1182233e868bf0d9d37f6b6ae6d8c15c234", + "blk.4.ffn_up.weight": "c3390c69533de2c8424e8069323ccc5d0c4543111535da04cf2c7d26745576aa", + "blk.5.attn_norm.weight": "0d71c4fbcefabbd021569442853d2fe90668b19409ae2805a718a829ca60beab", + "blk.5.attn_output.weight": "10ebd93629112bf2df5c30dd0953a4a5e9020306768283181ed426934d47e14f", + "blk.5.attn_qkv.weight": "5cb05633369f12d4b00e0ff787736bd846856682115720ebc6cce05270c334f6", + "blk.5.ffn_down.weight": "e28bcc5094212eafc7476dbc5b7a520d25b79578cbf4229d698e2655956a80ad", + "blk.5.ffn_norm.weight": "b6f2c4cf9f34bb4d59989f96165c14a67dc1e266ad0a6d0fcc49f1add929e6ff", + "blk.5.ffn_up.weight": "0f9ef99423cc07ebedc0e9cfa95809f2d7108d910bb4ef97ebc0b0309c440750", + "blk.6.attn_norm.weight": "b3edcc47a42218234f7564d7470611b49401a41ae8cd42123f86557c69f5d7f2", + "blk.6.attn_output.weight": "eb9b7d257b388bb5b8fe0515e5c6873317239cb94cda236e4b6ada2a6c57c65c", + "blk.6.attn_qkv.weight": "eb968081f478c52f07bd9c2761741e982dba33cc4eeadeea3557d391b9ac2106", + "blk.6.ffn_down.weight": "1b8588bb7463206290322695577dcfced300895d6e6f4b26966c53a9ae2f0f84", + "blk.6.ffn_norm.weight": "1219c04b7770983c77814200eefe743f46d15328ea2b12711e44f8103eab08d3", + "blk.6.ffn_up.weight": "197ef287239fec47c55677f0fbb66eaf0644f775bc382de843971730721394f6", + "blk.7.attn_norm.weight": "b630ad08c80d564ed1c024384818e9fd3f22a36cd7a14aa96e7e2759a8285099", + "blk.7.attn_output.weight": "970255aa750828a47d6b9d399f9612b5bf25aefe7dadbcba41fc416d0d4067c1", + "blk.7.attn_qkv.weight": "ebb157c880293e6de8d629f263ba8853ed1dbdc02c311d43432bb8cfbb310739", + "blk.7.ffn_down.weight": "24bcd4db4cba844c89f878b81843c373dbbc0675e889d32c5b12e63384a7b670", + "blk.7.ffn_norm.weight": "b9c6f71001808ee873ce7db8056e4b53fb4cccec8b7f0f312899b575fae39d39", + "blk.7.ffn_up.weight": "979f1828d227455c26015a2a11afe9dd05f2bb97a8ba6b38c8dab3f50e627401", + "blk.8.attn_norm.weight": "4e8e347e3775010b7112ee630f2f4f2383be7ff64e6ca6154b9b22566552eaa6", + "blk.8.attn_output.weight": "65a44babf44a435a1829945211b3168f9ec78ac3cb7a049a733e93d11f0d6659", + "blk.8.attn_qkv.weight": "343ed07671da400b040812a4058482fa38284b5d9af9becfed07417fe26ce747", + "blk.8.ffn_down.weight": "7fb7e073e3c2c503c4e9d60efa0988fed7398d900cc003695fe3fffd3e188b82", + "blk.8.ffn_norm.weight": "b07c1f655d8593e3892a2cf73f8a0c19ce8e5cb613fafbe7cbd430da8ce4c57d", + "blk.8.ffn_up.weight": "8b26e14de54b3fdc2e2d3ea41720f9d9c236a93688c3b7fd7bf43f5fbb327c9b", + "blk.9.attn_norm.weight": "46394d408a8e316916177e6aa261de32e137a82d729c0b1800b072f0c38c39b6", + "blk.9.attn_output.weight": "d57f3d46107947a7073373a0b35d6ecf7759b5df15406f4a3590a60666af6b16", + "blk.9.attn_qkv.weight": "14bb8ace8c5453148f4b536e9f4279c813f31136716947256f5cca333448639c", + "blk.9.ffn_down.weight": "2b8d98e2b5ed68338f6e4de43bf7de0c4858cc69103cd5177725f7444eec7694", + "blk.9.ffn_norm.weight": "41a499dfd418cc4c6b8c12313f673f7e2cd4a3f9c4065eb6c4feb5eed02fb542", + "blk.9.ffn_up.weight": "143aab7533a64b17fbe201490a6f674bc7f0bd370c094500b2e100419073d1c2", + "blk.10.attn_norm.weight": "ebb670aafd36816a794347287269d8f1a5b19c1e3c0a1e38023bc19fdba9b073", + "blk.10.attn_output.weight": "b5d65bbc0ed5e49fdd9d754bc18163cd042a285024d0cf6f954c503bc8c877cb", + "blk.10.attn_qkv.weight": "f06b15bac88da798fa34a62b03eaac0dbe8b846020516603c387541f2d8dd672", + "blk.10.ffn_down.weight": 
"fb091fcd1b4de25d1bea94d1755e255cb02914a030d23e3a234e57b8d46bde6e", + "blk.10.ffn_norm.weight": "eb347bdf9c40414af87e13a8e72e40b31f004b50f7cb366f1a219ced60a61355", + "blk.10.ffn_up.weight": "ed2d52fc881a173f404fe8a1067862c9856d6c3e0d2e90a330a7aa394e3f84d1", + "blk.11.attn_norm.weight": "64e252603cf010a0e502ca39fdf8d0a196a79aec67c0d2bb9213fc0cb80c47d4", + "blk.11.attn_output.weight": "228e33e21c69f52efc74fdfc831bc9af271e44b2a29a3dced1d64e667ce36eb5", + "blk.11.attn_qkv.weight": "ab9ce6d4ef9e42ee0da3f20a7708a3bbc5e79e967b05fa86ba946a05e2eb63eb", + "blk.11.ffn_down.weight": "0ca133b7835c98dc77c25d64e4eb7873778bdb5e4d22d8b80f920f46865b43bd", + "blk.11.ffn_norm.weight": "02455741a0dfd161c79aa1ecc381901721f229fdcda5615622a629631fb61cfd", + "blk.11.ffn_up.weight": "9fecdcc099fbb8e23c6b1ea9294702a027f4a58d265543ec5e7be79b8f63b354", + "blk.12.attn_norm.weight": "783bb459911b1b3609a9b2bdfe272f1670add73b5471da738e07ac47e2e07dfd", + "blk.12.attn_output.weight": "1e1a914c9e48b857206ac5a1f7cead994bc1ea91d5d4fff8c834d73f2e38ef5d", + "blk.12.attn_qkv.weight": "5953e7185ccb87fb4dae8f9426ec86315d4c7794326e8ab59b3a95d4af2189f0", + "blk.12.ffn_down.weight": "a3eecf0f394f86e2cfb48a5940a5c50ca86d71883b2f79fcc642a935fabce0d4", + "blk.12.ffn_norm.weight": "0a4272e41373c23bd72f10d2d82930aa3a1480aac75832bfbf01cebf0b86b6a4", + "blk.12.ffn_up.weight": "06f42776de3a7ceac3025f26a7a8bd20e062233cce2bdaa2183470dc4b30b87d", + "blk.13.attn_norm.weight": "5915da60fb03e201fa649faba780e5fdf1c761c262b206e5415cf83181f65780", + "blk.13.attn_output.weight": "4dbf6eab074fa3835fd32bd631a8208e511037d5056d2fd3015735cca7674ef7", + "blk.13.attn_qkv.weight": "d3d8339a1c4782d9e73d77fdebe154d3c5b83ac40c9175b3e91a4977d08f876b", + "blk.13.ffn_down.weight": "de6772b46a55e1fd42b007637dfbf68b6598e5d5b61622da0935002e1e192d3a", + "blk.13.ffn_norm.weight": "5a640ea3b8c7be49c95a58a2327e10d8e8d9d142504bde5c8091613e5b961d7a", + "blk.13.ffn_up.weight": "f35e3545e4bd3531b2e843b5efd31dee0c13c807ee6386e65473ba67bbec30d0", + "blk.14.attn_norm.weight": "9b34986450b7c98b4927e81e61a816f9e84b1addc7c14926402100037aad6678", + "blk.14.attn_output.weight": "155d52efb23d366016d861a251d4d1f4a0c13699188c50d50dba016a0d8bfcd9", + "blk.14.attn_qkv.weight": "8e1415084e1f33c73a777f19e752489f4dd312cca047733e5ea643cd4a955e04", + "blk.14.ffn_down.weight": "a2a142226b94baa01ccb65bdea2b7418e49085c1d9c3c63e544e3112c58a25da", + "blk.14.ffn_norm.weight": "8aecfd9b0ae6affaea31a80c5c9a4a14b31deaa0db7bd8f6da2a64d23447921c", + "blk.14.ffn_up.weight": "0c1407237b8c1bd02f193346b5681926fe698a5055eac6a7450451b0f991707c", + "blk.15.attn_norm.weight": "e037bd19880bfa83d983200fb0c7866f8ad16c3ff5cc4b4f3a37ca7373870ff6", + "blk.15.attn_output.weight": "045fe4fc95cc129a1b92771b179c11b12845c4c088786c607f17bd98857e68e1", + "blk.15.attn_qkv.weight": "7621b7559705cab1d4dea1c69f76dbf9dc1c8837a203b656f484703b9c1b70ce", + "blk.15.ffn_down.weight": "7e5ac20e290bc60761e1cd972354fde225b7fa861048d44d9a0dd9b046d55f58", + "blk.15.ffn_norm.weight": "b6d830d88f1db1825687973c8c2b1a24c6fa84f07af8d0e3ef9c86009baca0b2", + "blk.15.ffn_up.weight": "dcda0957cd04fc45476774dba2bbf9aa89d6b05d5ca7b10ae6f73ad2c49b1cd3", + "blk.16.attn_norm.weight": "4ee9b70ba15cb2a08240f93990e90f5068c48fceb481f8e2186bec8b7214eb3f", + "blk.16.attn_output.weight": "315cfe5536658d2498192b2980eade15b2c9a4ff220e4011911457b1727fa103", + "blk.16.attn_qkv.weight": "3c8122e3ad637583b9dcde8ff3a323267d3014bb1f0f9771e5322260ca9ecc8d", + "blk.16.ffn_down.weight": "3b5fbebd5ee2b86cad96fb8a9b45a8770d08f82c1c8b74d7061e866f7020a18d", + 
"blk.16.ffn_norm.weight": "ffab69f20bda372de6e5878f0539163e2fc6ba113621ded95705fc3b1465c9f0", + "blk.16.ffn_up.weight": "0935ea3d258da42d6258406365f39f58ddaabfe97ea5977580db3635188f24a1", + "blk.17.attn_norm.weight": "f030441733f3d147b4a06a1eb4aeb8465c7c24d9c53bf4c48fe7e134d3629803", + "blk.17.attn_output.weight": "07a955ef09e8dc766ac0df647d0b2c69f23c4c69a7137654b4aad80303ed0eda", + "blk.17.attn_qkv.weight": "1c10688061e21e2fe12ad0cb54bf03895c1f83c3b0df743a42f548b52cbca1b2", + "blk.17.ffn_down.weight": "ebb9cc9836f41d88fdae2aa9a4355514e4edaec8d1577ffeb947a35204e77f52", + "blk.17.ffn_norm.weight": "50aff44f6528b13db5389f2ddcdb7676244947610bd7ffbff3f881c968c2a0d4", + "blk.17.ffn_up.weight": "d716537949582be33bde6b02e38f5a70081c9642a9fb05a61312126718b8d148", + "blk.18.attn_norm.weight": "0ea695c4e53d637902f46663a6ee42adc493c36794476acc7dbddaa05b13840d", + "blk.18.attn_output.weight": "5fd35b500221a612eb4f4bddf0e9b6b7db4d7733032a75f8802fb2d884647c2e", + "blk.18.attn_qkv.weight": "b0da37fd030fe69581f990bf23bfd35467a1bbe558af6de7c0924f6b72e92317", + "blk.18.ffn_down.weight": "b355c33f44b328f4bb977567de8f7544db4b005d7a8fbded658518ecf3c5a153", + "blk.18.ffn_norm.weight": "58b3fe9094079989a86e0387143259e1cc35952d24dc3df290c4ba6df44f5c51", + "blk.18.ffn_up.weight": "2ce530954c342c30ed2ead5353f931960bfae1d278868504c0efb973560fabbe", + "blk.19.attn_norm.weight": "533e9aed66feea8f0392aa81f9e293240e1f009a5334253915fb60c2749b615d", + "blk.19.attn_output.weight": "84f2d00f98a4113a779d3b5d1c3e7c914eb47784d3ab13b290367c124c2994aa", + "blk.19.attn_qkv.weight": "fbe6b9f53b07fa7537d3b3d452d20a9bc666f9fd41ec2091dd28bc2f70fc668f", + "blk.19.ffn_down.weight": "b30199e098c8bb3f890183d8b18471e80b62b604729b277ad62488dd71e1206b", + "blk.19.ffn_norm.weight": "c81373e41cd340b7badb19f9517c77c4250b4eb9a02dc758b8b49b652487d7ff", + "blk.19.ffn_up.weight": "5a5cb083ca7725720e3a890f7fa46354760e8007a8188849a092e305694a75e3", + "blk.20.attn_norm.weight": "4953091b4477e354357a8e743ba0a1900633e52f1599ee082a0c9b0b2b5cd978", + "blk.20.attn_output.weight": "62d54f7749cd6856097b2632066a322b0296df915fe66f382c5b5981be0d4f23", + "blk.20.attn_qkv.weight": "406de9e35b0729ebe902d7a47905cc7fb29a921431ed35dbef0c03e5690a1329", + "blk.20.ffn_down.weight": "62fb678b0d1261e19a4903a2b347d67afcc8acff01feb33a687a35a2d1e6f9a5", + "blk.20.ffn_norm.weight": "cd9d36b7e71e55c8925b97bb09c28219f182626bcff094878ae39c3db887a14b", + "blk.20.ffn_up.weight": "b9276771d79d3e932e73ccc520c3f8476342b9ef312ed2ee1e0da822e6e3ad18", + "blk.21.attn_norm.weight": "66d8c8a35e13ce9c2a0e75b670150e2c31484a55c2316df46075312196178ed3", + "blk.21.attn_output.weight": "12ab46c9382648f9b3350fdd92a6be6352743d62d6b520d7e2024e0c838588f5", + "blk.21.attn_qkv.weight": "a7909676ee1675ca23cd29a5fdd226df8dd9d68f94c6c9bbb51dd9fd38504008", + "blk.21.ffn_down.weight": "6fb317279c6542e82f97d5a12a60fac1bd0fa0405154f9fbe265e2fe39bd49cc", + "blk.21.ffn_norm.weight": "c0f703eb3ff161b5ba4490d87d8684b8a6c47a8f433e12f418333b9db439010a", + "blk.21.ffn_up.weight": "6dbdb80ef0c35e364bbce12d40d5e74c7963c7b55d58d9579567a07ffce7b863", + "blk.22.attn_norm.weight": "f94237433bf03d675cb2f655b81ca91a1ce2447bc6b00b13d6b0ccfe2d411eff", + "blk.22.attn_output.weight": "e821f95995ce497c01e63ca64f737713b1b65f11df1903e51d444aa516f33f71", + "blk.22.attn_qkv.weight": "1b0f717c73afb5eb4c82a1708c4e85c969e8a2a8770d9ddb78b1870a2d8a781e", + "blk.22.ffn_down.weight": "0f33f7a3cdc685484be99aa0c03642b0b20850a27d1fddbe054b13a9382f3ccb", + "blk.22.ffn_norm.weight": 
"9df285cf211ddd7df2b36a50489af574755c7d4d98b29a05cd04566ae613c8dc", + "blk.22.ffn_up.weight": "63ac300e1efb34041dd0136cf43ea622fac6f0caccce1cd9262f5e08d2cf179c", + "blk.23.attn_norm.weight": "5f72d9e88689b4027b28f5f8f26cd3abb03635ceea7ec98a4c91a9fc691f6707", + "blk.23.attn_output.weight": "6ecf04ff61125c5fc768f8656497152149373daf321ee9c957e8f7245a1184d1", + "blk.23.attn_qkv.weight": "a9d9978806724c2959f2cf386c233831f08e1e933dbf2b32665e788d9d512ea4", + "blk.23.ffn_down.weight": "72c7d17886a3da17fa0daa456aa5e877b2ef5b8b403182b870d9ca5ca9c70347", + "blk.23.ffn_norm.weight": "971e4b712e3025a13419b5b57d674b5e4ab7f18f74b57b9afc4671623da90c4b", + "blk.23.ffn_up.weight": "df2b5c7dbd5834545b815073af0c7355b065124e6d6f0fee78d8fa5b2076dc3e", + "blk.24.attn_norm.weight": "c41957c4a79ad3b16f6e11daec1c7f530b9f3f4b618e1e4367c3b67787ac4ab6", + "blk.24.attn_output.weight": "ef7d61f5fc88ac6f31bf60cb5f4d2d6b8df42d38825807112361a7224b0dee3b", + "blk.24.attn_qkv.weight": "3e6a58fe7d49c90bb6971efbad3371c32256881173ea5aee4b0c296cb206490f", + "blk.24.ffn_down.weight": "f43619144047de42fed81dfa495f1815d3cb771330e574043e2b67620819292c", + "blk.24.ffn_norm.weight": "5501d4a2a98c8ca6b42e77b53b221dbc08f530f6a067256d787534ec6fe028bd", + "blk.24.ffn_up.weight": "d64c8b0e509e2b1118f6000176f8956cacecdbb200c7e95ed93fb78b6e26c84a", + "blk.25.attn_norm.weight": "502fa3c302d371f61c5791f4615b73018ffb1daa09b6499b227116581244c5d4", + "blk.25.attn_output.weight": "ad8391d4e9c980856f2547aa945b2b6a407a6382158dc1ddd4f08d94ecc24be6", + "blk.25.attn_qkv.weight": "42e8983780d4a01a02c54ad23d4df21eea437f119a10af5a9c12a76a42d308c1", + "blk.25.ffn_down.weight": "302dd010d4e0ab4eeaee89090409ea0dddeeeed3236415eb8f97c942497eea91", + "blk.25.ffn_norm.weight": "fb34c1ee5bca96986c08834df0a0c047ba041c1123ac1f563e9d64312bf82d6a", + "blk.25.ffn_up.weight": "10739a8de156816d93c92b935386540bfa976bdbef204f0312960f6fc657582f", + "blk.26.attn_norm.weight": "7036c711609128c4e55968ff3681d3043338879a5737efd6c2ac9e1a2a61f1a0", + "blk.26.attn_output.weight": "db5db45dead5cb911fa01da59832f121b7c18b2d167bf53741c40819f24d346c", + "blk.26.attn_qkv.weight": "cae34c6b7f82ed14348d5ed30a79919c383737c1694a9cb9c0de609d3b0c1d0a", + "blk.26.ffn_down.weight": "491ec3a4da9b4f49f8ebc6be658ce397a9b801ae9fb35e82177e47808c65e5d0", + "blk.26.ffn_norm.weight": "fd7059d75d7f0e5288511ddeeb0f772eb3cae3ccfe4226b877015834edc3c386", + "blk.26.ffn_up.weight": "ea1ee1274c56458ce056d2205e5bb6e5422ce4cb0ad58006b8141749b97a0c39", + "blk.27.attn_norm.weight": "cc362c9a937609265052cd38544af17a1a7448cea086d4c801139e1fc865832d", + "blk.27.attn_output.weight": "ba757a81dabde9cb1b069d1bb616fe79649a1724f756567ec61caed1304fe6cf", + "blk.27.attn_qkv.weight": "1ab8d7d02d87756c12c2275636823aa5ede3d683178225c4cac4bd892c319bd4", + "blk.27.ffn_down.weight": "deb1c711c8a66acf4dcd2d088e1548f8e08f296f755e4067d6557fa55afde88c", + "blk.27.ffn_norm.weight": "fc6242d8cb8a4a37a8ddb7e41e7e60a63d4a89edf36acb35df052f10b9c91ece", + "blk.27.ffn_up.weight": "8df39b09c4801f343aca78f2918a1f6db78c8c55e591eda4c69eadb74c26e180", + "blk.28.attn_norm.weight": "75b539308f77e3cefdc6d98484d8b5cbf0538f0c2869a77b7373a145a18bc850", + "blk.28.attn_output.weight": "ae128940eb60a6d2e121762ef4b3e9dcf9eb3e105b249507fa7f12de0e19822c", + "blk.28.attn_qkv.weight": "bdda781c288e9326c240e33905f8e621b6a2ad902e620739d34f93fcd6f933de", + "blk.28.ffn_down.weight": "f1d6e6d1c286b1138bfd7e53fe477f399ae93bc2c04e35416f84218ed7247965", + "blk.28.ffn_norm.weight": "3f837ce82c8b9bde0d61d08b6f5fe5574886ea5328dbdc53f2929f18da8b4087", + 
"blk.28.ffn_up.weight": "2af027002e31d1b6cfedbdb30a2b9d7213f3aa691167c353913adfd48fda31e4", + "blk.29.attn_norm.weight": "61e8003b5329462ffe0fe172f2b160260de006aed858332d49d75504b6b6aa7a", + "blk.29.attn_output.weight": "ca44542a72a37476dc73dbdcc01f5b7497cb3ebc4ea230a55c9634ccd8e56ad4", + "blk.29.attn_qkv.weight": "abb3d9d6abe57872ae3daa51935d43264093ded5ce63b49d1e280ee5758be0e4", + "blk.29.ffn_down.weight": "6764b895fce881df097489c263446f0106de36217997660c15984b3ee22a5a06", + "blk.29.ffn_norm.weight": "89e03e9a33fc0e6e31ba9f0c2bd7c5734a118c5602bb90148793e08a80e8d0ae", + "blk.29.ffn_up.weight": "fa7ad57a84954f4121653152efed1a871d8adb20a1ea9086e3e849ce359d7d2e", + "blk.30.attn_norm.weight": "91a697aca1e42af54f806a20211031c3369e8d0bd58df1b0147fe24954e1f5a4", + "blk.30.attn_output.weight": "36063fcf766c89ac75be56f688cc63cefe5f2c733fbf4378ea9956ad386fa148", + "blk.30.attn_qkv.weight": "2cacd1161f1121a2c0b979930134f4666f73fb8d7237b3b0659ae091b15955a6", + "blk.30.ffn_down.weight": "9f3fcb6217100595850c05dc98f9ab2a263afdb6ab28df2fcb08aeff512057d7", + "blk.30.ffn_norm.weight": "6c600bc1fc7de39d4f8917b81fc7d1d5ed2a9b56492234c13a4bd6028c30d880", + "blk.30.ffn_up.weight": "73cabd1bb011956b2689ea3338bb76642ef3a57c197377d666d2ab5f56317668", + "blk.31.attn_norm.weight": "72d3e1cc771380645fa75a899858c95f39857a4f3f1ed60fe1578df383b8bc53", + "blk.31.attn_output.weight": "40089cdd29994dc19a1d89fa15902a89cfeca3540f12dc9bf4d00ef82506e456", + "blk.31.attn_qkv.weight": "1d0bb40e9258071ae14290a53c619a8e331dda07354d2a02ef45766c029ae5e4", + "blk.31.ffn_down.weight": "8defa0e06335b793fa8be03883f0a322d6c5b33f52c69c943c35c60d16e42c0a", + "blk.31.ffn_norm.weight": "33c55d9d0c496ccfb130361fe131649346e098abaaac39c0519507e5d846721d", + "blk.31.ffn_up.weight": "599f6503f61c692c1f82001973d35119f9688db5e6be9d9c298411491c93f09b", + "output.weight": "14b8dc662bfa3308ebb2e102c562d8e52c15670e538f20f3216a9c310ca9dd41", + "output_norm.weight": "7f2294ba94ce65681df6c7ddd8698799199b9d77dc83c10bdad5c3999f0fdb82", + "rope_factors_long.weight": "e34d378664e354652c38f47d10dafb0498ccc2fb042d39ff7fef768146fff22b", + "rope_factors_short.weight": "9379146a4988f373d362fe47b06c75e7fe7c54aa4dc9558758df79b7a87471fd", + "token_embd.weight": "19a03c1fb5ac0baee93b0a7d8b0f26e9a9b011e229b694afc50ebfc13d84f8bf" +} diff --git a/llm/ggml.go b/llm/ggml.go index d7f2eef7..4c68adf9 100644 --- a/llm/ggml.go +++ b/llm/ggml.go @@ -157,6 +157,14 @@ type Tensor struct { io.WriterTo `json:"-"` } +func (t Tensor) block() (n int) { + if _, err := fmt.Sscanf(t.Name, "blk.%d.", &n); err != nil { + return -1 + } + + return +} + func (t Tensor) blockSize() uint64 { switch t.Kind { case 0, 1, 24, 25, 26, 27, 28, 30: // F32, F16, I8, I16, I32, I64, F64, BF16 diff --git a/llm/gguf.go b/llm/gguf.go index 98158313..2e6bc542 100644 --- a/llm/gguf.go +++ b/llm/gguf.go @@ -532,15 +532,14 @@ func WriteGGUF(ws io.WriteSeeker, kv KV, ts []Tensor) error { } } - slices.SortFunc(ts, func(a, b Tensor) int { - var i, j int - if n, err := fmt.Sscanf(a.Name, "blk.%d", &i); err != nil || n != 1 { - return cmp.Compare(a.Name, b.Name) - } else if n, err := fmt.Sscanf(b.Name, "blk.%d", &j); err != nil || n != 1 { - return cmp.Compare(a.Name, b.Name) + slices.SortStableFunc(ts, func(a, b Tensor) int { + if i, j := a.block(), b.block(); i < 0 && j > 0 { + return 1 + } else if i > 0 && j < 0 { + return -1 + } else { + return cmp.Compare(i, j) } - - return cmp.Compare(i, j) }) var s uint64 From aec77d6a05c3cd13732eab7decc9794bbed670d9 Mon Sep 17 00:00:00 2001 From: Bruce MacDonald Date: Tue, 
2 Jul 2024 14:40:01 -0700 Subject: [PATCH 34/71] support new "longrope" attention factor --- convert/convert_phi3.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert/convert_phi3.go b/convert/convert_phi3.go index 7aa3ed15..0f645217 100644 --- a/convert/convert_phi3.go +++ b/convert/convert_phi3.go @@ -58,7 +58,7 @@ func (p *phi3) KV(t *Tokenizer) llm.KV { switch p.RopeScaling.Type { case "": // no scaling - case "su": + case "su", "longrope": kv["phi3.rope.scaling.attn_factor"] = float32(max(math.Sqrt(1+math.Log(scale)/math.Log(float64(p.OriginalMaxPositionEmbeddings))), 1.0)) case "yarn": kv["phi3.rope.scaling.attn_factor"] = float32(max(0.1*math.Log(scale)+1.0, 1.0)) From bd5e432630a0c1d1ca5795052355a45014e71a2a Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Mon, 5 Aug 2024 10:30:32 -0700 Subject: [PATCH 35/71] update import.md --- docs/import.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/import.md b/docs/import.md index f34f09ac..82ea9ba5 100644 --- a/docs/import.md +++ b/docs/import.md @@ -16,7 +16,9 @@ If the model being imported is one of these architectures, it can be imported di - LlamaForCausalLM - MistralForCausalLM + - MixtralForCausalLM - GemmaForCausalLM + - Phi3ForCausalLM ```dockerfile FROM /path/to/safetensors/directory From 8b00a415ab5170a5a75b105402ca262d1fb7ac12 Mon Sep 17 00:00:00 2001 From: royjhan <65097070+royjhan@users.noreply.github.com> Date: Tue, 13 Aug 2024 13:19:56 -0400 Subject: [PATCH 36/71] Load Embedding Model on Empty Input (#6325) * load on empty input * no load on invalid input --- server/routes.go | 16 +++++----- server/routes_test.go | 70 ------------------------------------------- 2 files changed, 9 insertions(+), 77 deletions(-) diff --git a/server/routes.go b/server/routes.go index e5a31002..6c470c17 100644 --- a/server/routes.go +++ b/server/routes.go @@ -324,13 +324,10 @@ func (s *Server) EmbedHandler(c *gin.Context) { input = append(input, v.(string)) } default: - c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "invalid input type"}) - return - } - - if len(input) == 0 { - c.JSON(http.StatusOK, api.EmbedResponse{Model: req.Model, Embeddings: [][]float32{}}) - return + if req.Input != nil { + c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "invalid input type"}) + return + } } r, m, opts, err := s.scheduleRunner(c.Request.Context(), req.Model, []Capability{}, req.Options, req.KeepAlive) @@ -341,6 +338,11 @@ func (s *Server) EmbedHandler(c *gin.Context) { checkpointLoaded := time.Now() + if len(input) == 0 { + c.JSON(http.StatusOK, api.EmbedResponse{Model: req.Model, Embeddings: [][]float32{}}) + return + } + kvData, err := getKVData(m.ModelPath, false) if err != nil { c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) diff --git a/server/routes_test.go b/server/routes_test.go index ef7248ef..242875d6 100644 --- a/server/routes_test.go +++ b/server/routes_test.go @@ -272,76 +272,6 @@ func Test_Routes(t *testing.T) { assert.Equal(t, "library", retrieveResp.OwnedBy) }, }, - { - Name: "Embed Handler Empty Input", - Method: http.MethodPost, - Path: "/api/embed", - Setup: func(t *testing.T, req *http.Request) { - embedReq := api.EmbedRequest{ - Model: "t-bone", - Input: "", - } - jsonData, err := json.Marshal(embedReq) - require.NoError(t, err) - req.Body = io.NopCloser(bytes.NewReader(jsonData)) - }, - Expected: func(t *testing.T, resp *http.Response) { - contentType := resp.Header.Get("Content-Type") - if contentType != "application/json; charset=utf-8" { - 
t.Fatalf("expected content type application/json; charset=utf-8, got %s", contentType) - } - body, err := io.ReadAll(resp.Body) - if err != nil { - t.Fatal(err) - } - - var embedResp api.EmbedResponse - err = json.Unmarshal(body, &embedResp) - if err != nil { - t.Fatal(err) - } - - if embedResp.Model != "t-bone" { - t.Fatalf("expected model t-bone, got %s", embedResp.Model) - } - - if embedResp.Embeddings == nil { - t.Fatalf("expected embeddings to not be nil, got %v", embedResp.Embeddings) - } - - if len(embedResp.Embeddings) != 0 { - t.Fatalf("expected embeddings to be empty, got %v", embedResp.Embeddings) - } - }, - }, - { - Name: "Embed Handler Invalid Input", - Method: http.MethodPost, - Path: "/api/embed", - Setup: func(t *testing.T, req *http.Request) { - embedReq := api.EmbedRequest{ - Model: "t-bone", - Input: 2, - } - jsonData, err := json.Marshal(embedReq) - require.NoError(t, err) - req.Body = io.NopCloser(bytes.NewReader(jsonData)) - }, - Expected: func(t *testing.T, resp *http.Response) { - contentType := resp.Header.Get("Content-Type") - if contentType != "application/json; charset=utf-8" { - t.Fatalf("expected content type application/json; charset=utf-8, got %s", contentType) - } - _, err := io.ReadAll(resp.Body) - if err != nil { - t.Fatal(err) - } - - if resp.StatusCode != http.StatusBadRequest { - t.Fatalf("expected status code 400, got %d", resp.StatusCode) - } - }, - }, } t.Setenv("OLLAMA_MODELS", t.TempDir()) From feedf49c717a449cedbf973b06ca97796cfaa004 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Tue, 13 Aug 2024 11:44:50 -0700 Subject: [PATCH 37/71] Go back to a pinned Go version Go version 1.22.6 is triggering AV false positives, so go back to 1.22.5 --- .github/workflows/release.yaml | 10 +++++----- .github/workflows/test.yaml | 10 +++++----- go.mod | 2 +- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index f0c6db5d..5ae630c3 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -31,7 +31,7 @@ jobs: security set-keychain-settings -lut 3600 build.keychain - uses: actions/setup-go@v5 with: - go-version: "stable" + go-version-file: go.mod cache: true - name: Build Darwin env: @@ -87,7 +87,7 @@ jobs: write-host "plugin installed" - uses: actions/setup-go@v5 with: - go-version: "stable" + go-version-file: go.mod cache: true - run: go get ./... - run: | @@ -141,7 +141,7 @@ jobs: write-host "plugin installed" - uses: actions/setup-go@v5 with: - go-version: "stable" + go-version-file: go.mod cache: true - name: 'Install ROCm' run: | @@ -218,7 +218,7 @@ jobs: write-host "plugin installed" - uses: actions/setup-go@v5 with: - go-version: "stable" + go-version-file: go.mod cache: true - name: 'Install CUDA' run: | @@ -306,7 +306,7 @@ jobs: write-host "plugin installed" - uses: actions/setup-go@v5 with: - go-version: "stable" + go-version-file: go.mod cache: true - run: go get - uses: actions/download-artifact@v4 diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index a57d45fd..3d58fa3e 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -63,7 +63,7 @@ jobs: - uses: actions/checkout@v4 - uses: actions/setup-go@v5 with: - go-version: "stable" + go-version-file: go.mod cache: true - run: go get ./... 
- run: | @@ -163,7 +163,7 @@ jobs: - uses: actions/checkout@v4 - uses: actions/setup-go@v5 with: - go-version: "stable" + go-version-file: go.mod cache: true - name: 'Install ROCm' run: | @@ -200,7 +200,7 @@ jobs: - uses: actions/checkout@v4 - uses: actions/setup-go@v5 with: - go-version: "stable" + go-version-file: go.mod cache: true - name: 'Install CUDA' run: | @@ -255,7 +255,7 @@ jobs: submodules: recursive - uses: actions/setup-go@v5 with: - go-version: "stable" + go-version-file: go.mod cache: false - run: | case ${{ matrix.arch }} in @@ -297,7 +297,7 @@ jobs: submodules: recursive - uses: actions/setup-go@v5 with: - go-version: "stable" + go-version-file: go.mod cache: true - run: | case ${{ matrix.arch }} in diff --git a/go.mod b/go.mod index 2e0c6614..6e437c73 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module github.com/ollama/ollama -go 1.22.0 +go 1.22.5 require ( github.com/containerd/console v1.0.3 From 1f32276178d5860bbaeafb7dd73d4ef93053bc15 Mon Sep 17 00:00:00 2001 From: Pamela Fox Date: Tue, 13 Aug 2024 13:36:05 -0700 Subject: [PATCH 38/71] Update openai.md to remove extra checkbox (#6345) --- docs/openai.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/openai.md b/docs/openai.md index 7b3a3f31..75d2c595 100644 --- a/docs/openai.md +++ b/docs/openai.md @@ -182,7 +182,6 @@ curl http://localhost:11434/v1/embeddings \ - [x] Reproducible outputs - [x] Vision - [x] Tools (streaming support coming soon) -- [ ] Vision - [ ] Logprobs #### Supported request fields From 2697d7f5aad27248aebbe5acff1dcbede5367b7b Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Tue, 13 Aug 2024 13:40:37 -0700 Subject: [PATCH 39/71] lint - fixes printf: non-constant format string in call to fmt.Printf - fixes SA1032: arguments have the wrong order - disables testifylint --- .golangci.yaml | 1 - readline/buffer.go | 79 ++++++++++++++++---------------------------- readline/readline.go | 2 +- readline/types.go | 61 ++++++++++++++++++++++------------ server/sched.go | 2 +- 5 files changed, 70 insertions(+), 75 deletions(-) diff --git a/.golangci.yaml b/.golangci.yaml index c2c8b52b..c9c9f620 100644 --- a/.golangci.yaml +++ b/.golangci.yaml @@ -24,7 +24,6 @@ linters: - nosprintfhostport - staticcheck - tenv - - testifylint - unconvert - unused - usestdlibvars diff --git a/readline/buffer.go b/readline/buffer.go index 68573d40..d91fe0a9 100644 --- a/readline/buffer.go +++ b/readline/buffer.go @@ -62,7 +62,7 @@ func (b *Buffer) MoveLeft() { rLength := runewidth.RuneWidth(r) if b.DisplayPos%b.LineWidth == 0 { - fmt.Printf(CursorUp + CursorBOL + cursorRightN(b.Width)) + fmt.Print(CursorUp + CursorBOL + CursorRightN(b.Width)) if rLength == 2 { fmt.Print(CursorLeft) } @@ -74,7 +74,7 @@ func (b *Buffer) MoveLeft() { fmt.Print(CursorLeft) } } else { - fmt.Print(cursorLeftN(rLength)) + fmt.Print(CursorLeftN(rLength)) } b.Pos -= 1 @@ -115,15 +115,15 @@ func (b *Buffer) MoveRight() { b.DisplayPos += rLength if b.DisplayPos%b.LineWidth == 0 { - fmt.Printf(CursorDown + CursorBOL + cursorRightN(len(b.Prompt.prompt()))) + fmt.Print(CursorDown + CursorBOL + CursorRightN(len(b.Prompt.prompt()))) } else if (b.DisplayPos-rLength)%b.LineWidth == b.LineWidth-1 && hasSpace { - fmt.Printf(CursorDown + CursorBOL + cursorRightN(len(b.Prompt.prompt())+rLength)) + fmt.Print(CursorDown + CursorBOL + CursorRightN(len(b.Prompt.prompt())+rLength)) b.DisplayPos += 1 } else if b.LineHasSpace.Size() > 0 && b.DisplayPos%b.LineWidth == b.LineWidth-1 && hasSpace { - fmt.Printf(CursorDown + CursorBOL + 
cursorRightN(len(b.Prompt.prompt()))) + fmt.Print(CursorDown + CursorBOL + CursorRightN(len(b.Prompt.prompt()))) b.DisplayPos += 1 } else { - fmt.Print(cursorRightN(rLength)) + fmt.Print(CursorRightN(rLength)) } } } @@ -154,7 +154,7 @@ func (b *Buffer) MoveToStart() { fmt.Print(CursorUp) } } - fmt.Printf(CursorBOL + cursorRightN(len(b.Prompt.prompt()))) + fmt.Print(CursorBOL + CursorRightN(len(b.Prompt.prompt()))) b.Pos = 0 b.DisplayPos = 0 } @@ -169,9 +169,9 @@ func (b *Buffer) MoveToEnd() { fmt.Print(CursorDown) } remainder := b.DisplaySize() % b.LineWidth - fmt.Printf(CursorBOL + cursorRightN(len(b.Prompt.prompt())+remainder)) + fmt.Print(CursorBOL + CursorRightN(len(b.Prompt.prompt())+remainder)) } else { - fmt.Print(cursorRightN(b.DisplaySize() - b.DisplayPos)) + fmt.Print(CursorRightN(b.DisplaySize() - b.DisplayPos)) } b.Pos = b.Buf.Size() @@ -286,8 +286,7 @@ func (b *Buffer) drawRemaining() { remLength := runewidth.StringWidth(remainingText) if len(currLine) > 0 { - fmt.Printf(ClearToEOL + currLine) - fmt.Print(cursorLeftN(currLineSpace)) + fmt.Print(ClearToEOL + currLine + CursorLeftN(currLineSpace)) } else { fmt.Print(ClearToEOL) } @@ -301,9 +300,9 @@ func (b *Buffer) drawRemaining() { } if (b.DisplayPos+currLineSpace)%b.LineWidth == 0 && currLine == remainingText { - fmt.Print(cursorRightN(currLineSpace)) + fmt.Print(CursorRightN(currLineSpace)) fmt.Printf("\n%s", b.Prompt.AltPrompt) - fmt.Printf(CursorUp + CursorBOL + cursorRightN(b.Width-currLineSpace)) + fmt.Print(CursorUp + CursorBOL + CursorRightN(b.Width-currLineSpace)) } // render the other lines @@ -333,9 +332,7 @@ func (b *Buffer) drawRemaining() { lineLength += runewidth.RuneWidth(c) fmt.Printf("%c", c) } - fmt.Print(ClearToEOL) - fmt.Print(cursorUpN(totalLines)) - fmt.Printf(CursorBOL + cursorRightN(b.Width-currLineSpace)) + fmt.Print(ClearToEOL + CursorUpN(totalLines) + CursorBOL + CursorRightN(b.Width-currLineSpace)) hasSpace := b.GetLineSpacing(b.DisplayPos / b.LineWidth) @@ -357,8 +354,7 @@ func (b *Buffer) Remove() { if b.DisplayPos%b.LineWidth == 0 { // if the user backspaces over the word boundary, do this magic to clear the line // and move to the end of the previous line - fmt.Printf(CursorBOL + ClearToEOL) - fmt.Printf(CursorUp + CursorBOL + cursorRightN(b.Width)) + fmt.Print(CursorBOL + ClearToEOL + CursorUp + CursorBOL + CursorRightN(b.Width)) if b.DisplaySize()%b.LineWidth < (b.DisplaySize()-rLength)%b.LineWidth { b.LineHasSpace.Remove(b.DisplayPos/b.LineWidth - 1) @@ -370,24 +366,23 @@ func (b *Buffer) Remove() { } if rLength == 2 { - fmt.Print(CursorLeft + " " + cursorLeftN(2)) + fmt.Print(CursorLeft + " " + CursorLeftN(2)) } else { fmt.Print(" " + CursorLeft) } } else if (b.DisplayPos-rLength)%b.LineWidth == 0 && hasSpace { - fmt.Printf(CursorBOL + ClearToEOL) - fmt.Printf(CursorUp + CursorBOL + cursorRightN(b.Width)) + fmt.Print(CursorBOL + ClearToEOL + CursorUp + CursorBOL + CursorRightN(b.Width)) if b.Pos == b.Buf.Size() { b.LineHasSpace.Remove(b.DisplayPos/b.LineWidth - 1) } b.DisplayPos -= 1 } else { - fmt.Print(cursorLeftN(rLength)) + fmt.Print(CursorLeftN(rLength)) for range rLength { fmt.Print(" ") } - fmt.Print(cursorLeftN(rLength)) + fmt.Print(CursorLeftN(rLength)) } var eraseExtraLine bool @@ -405,9 +400,9 @@ func (b *Buffer) Remove() { // are trailing characters which go over the line width boundary if eraseExtraLine { remainingLines := (b.DisplaySize() - b.DisplayPos) / b.LineWidth - fmt.Printf(cursorDownN(remainingLines+1) + CursorBOL + ClearToEOL) + fmt.Print(CursorDownN(remainingLines+1) 
+ CursorBOL + ClearToEOL) place := b.DisplayPos % b.LineWidth - fmt.Printf(cursorUpN(remainingLines+1) + cursorRightN(place+len(b.Prompt.prompt()))) + fmt.Print(CursorUpN(remainingLines+1) + CursorRightN(place+len(b.Prompt.prompt()))) } } } @@ -422,9 +417,9 @@ func (b *Buffer) Delete() { if b.DisplaySize()%b.LineWidth == 0 { if b.DisplayPos != b.DisplaySize() { remainingLines := (b.DisplaySize() - b.DisplayPos) / b.LineWidth - fmt.Printf(cursorDownN(remainingLines) + CursorBOL + ClearToEOL) + fmt.Print(CursorDownN(remainingLines) + CursorBOL + ClearToEOL) place := b.DisplayPos % b.LineWidth - fmt.Printf(cursorUpN(remainingLines) + cursorRightN(place+len(b.Prompt.prompt()))) + fmt.Print(CursorUpN(remainingLines) + CursorRightN(place+len(b.Prompt.prompt()))) } } } @@ -471,17 +466,17 @@ func (b *Buffer) DeleteWord() { } func (b *Buffer) ClearScreen() { - fmt.Printf(ClearScreen + CursorReset + b.Prompt.prompt()) + fmt.Print(ClearScreen + CursorReset + b.Prompt.prompt()) if b.IsEmpty() { ph := b.Prompt.placeholder() - fmt.Printf(ColorGrey + ph + cursorLeftN(len(ph)) + ColorDefault) + fmt.Print(ColorGrey + ph + CursorLeftN(len(ph)) + ColorDefault) } else { currPos := b.DisplayPos currIndex := b.Pos b.Pos = 0 b.DisplayPos = 0 b.drawRemaining() - fmt.Printf(CursorReset + cursorRightN(len(b.Prompt.prompt()))) + fmt.Print(CursorReset + CursorRightN(len(b.Prompt.prompt()))) if currPos > 0 { targetLine := currPos / b.LineWidth if targetLine > 0 { @@ -491,10 +486,10 @@ func (b *Buffer) ClearScreen() { } remainder := currPos % b.LineWidth if remainder > 0 { - fmt.Print(cursorRightN(remainder)) + fmt.Print(CursorRightN(remainder)) } if currPos%b.LineWidth == 0 { - fmt.Printf(CursorBOL + b.Prompt.AltPrompt) + fmt.Print(CursorBOL + b.Prompt.AltPrompt) } } b.Pos = currIndex @@ -513,13 +508,13 @@ func (b *Buffer) Replace(r []rune) { b.Buf.Clear() - fmt.Printf(CursorBOL + ClearToEOL) + fmt.Print(CursorBOL + ClearToEOL) for range lineNums { fmt.Print(CursorUp + CursorBOL + ClearToEOL) } - fmt.Printf(CursorBOL + b.Prompt.prompt()) + fmt.Print(CursorBOL + b.Prompt.prompt()) for _, c := range r { b.Add(c) @@ -545,19 +540,3 @@ func (b *Buffer) StringNM(n, m int) string { } return s } - -func cursorLeftN(n int) string { - return fmt.Sprintf(CursorLeftN, n) -} - -func cursorRightN(n int) string { - return fmt.Sprintf(CursorRightN, n) -} - -func cursorUpN(n int) string { - return fmt.Sprintf(CursorUpN, n) -} - -func cursorDownN(n int) string { - return fmt.Sprintf(CursorDownN, n) -} diff --git a/readline/readline.go b/readline/readline.go index e90a5e01..1c14fe10 100644 --- a/readline/readline.go +++ b/readline/readline.go @@ -98,7 +98,7 @@ func (i *Instance) Readline() (string, error) { showPlaceholder := !i.Pasting || i.Prompt.UseAlt if buf.IsEmpty() && showPlaceholder { ph := i.Prompt.placeholder() - fmt.Printf(ColorGrey + ph + fmt.Sprintf(CursorLeftN, len(ph)) + ColorDefault) + fmt.Print(ColorGrey + ph + CursorLeftN(len(ph)) + ColorDefault) } r, err := i.Terminal.Read() diff --git a/readline/types.go b/readline/types.go index 3b88588f..e136d996 100644 --- a/readline/types.go +++ b/readline/types.go @@ -1,5 +1,7 @@ package readline +import "strconv" + const ( CharNull = 0 CharLineStart = 1 @@ -41,34 +43,49 @@ const ( ) const ( - CursorUp = "\033[1A" - CursorDown = "\033[1B" - CursorRight = "\033[1C" - CursorLeft = "\033[1D" + Esc = "\x1b" - CursorSave = "\033[s" - CursorRestore = "\033[u" + CursorSave = Esc + "[s" + CursorRestore = Esc + "[u" - CursorUpN = "\033[%dA" - CursorDownN = "\033[%dB" - CursorRightN = 
"\033[%dC" - CursorLeftN = "\033[%dD" + CursorEOL = Esc + "[E" + CursorBOL = Esc + "[1G" + CursorHide = Esc + "[?25l" + CursorShow = Esc + "[?25h" - CursorEOL = "\033[E" - CursorBOL = "\033[1G" - CursorHide = "\033[?25l" - CursorShow = "\033[?25h" + ClearToEOL = Esc + "[K" + ClearLine = Esc + "[2K" + ClearScreen = Esc + "[2J" + CursorReset = Esc + "[0;0f" - ClearToEOL = "\033[K" - ClearLine = "\033[2K" - ClearScreen = "\033[2J" - CursorReset = "\033[0;0f" + ColorGrey = Esc + "[38;5;245m" + ColorDefault = Esc + "[0m" - ColorGrey = "\033[38;5;245m" - ColorDefault = "\033[0m" + StartBracketedPaste = Esc + "[?2004h" + EndBracketedPaste = Esc + "[?2004l" +) - StartBracketedPaste = "\033[?2004h" - EndBracketedPaste = "\033[?2004l" +func CursorUpN(n int) string { + return Esc + "[" + strconv.Itoa(n) + "A" +} + +func CursorDownN(n int) string { + return Esc + "[" + strconv.Itoa(n) + "B" +} + +func CursorRightN(n int) string { + return Esc + "[" + strconv.Itoa(n) + "C" +} + +func CursorLeftN(n int) string { + return Esc + "[" + strconv.Itoa(n) + "D" +} + +var ( + CursorUp = CursorUpN(1) + CursorDown = CursorDownN(1) + CursorRight = CursorRightN(1) + CursorLeft = CursorLeftN(1) ) const ( diff --git a/server/sched.go b/server/sched.go index c378865b..9947fd32 100644 --- a/server/sched.go +++ b/server/sched.go @@ -418,7 +418,7 @@ func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, // some older models are not compatible with newer versions of llama.cpp // show a generalized compatibility error until there is a better way to // check for model compatibility - if errors.Is(llm.ErrUnsupportedFormat, err) || strings.Contains(err.Error(), "failed to load model") { + if errors.Is(err, llm.ErrUnsupportedFormat) || strings.Contains(err.Error(), "failed to load model") { err = fmt.Errorf("%v: this model may be incompatible with your version of Ollama. If you previously pulled this model, try updating it by running `ollama pull %s`", err, req.model.ShortName) } slog.Info("NewLlamaServer failed", "model", req.model.ModelPath, "error", err) From eda8a32a0936c1aec120b3c544e402cbba7b7eb7 Mon Sep 17 00:00:00 2001 From: Bruce MacDonald Date: Tue, 13 Aug 2024 23:39:18 +0000 Subject: [PATCH 40/71] update chatml template format to latest in docs (#6344) --- docs/template.md | 6 ------ 1 file changed, 6 deletions(-) diff --git a/docs/template.md b/docs/template.md index f6ce06ba..1d7104de 100644 --- a/docs/template.md +++ b/docs/template.md @@ -112,15 +112,9 @@ Keep the following tips and best practices in mind when working with Go template ChatML is a popular template format. It can be used for models such as Databrick's DBRX, Intel's Neural Chat, and Microsoft's Orca 2. ```gotmpl -{{- if .System }}<|im_start|>system -{{ .System }}<|im_end|> -{{ end }} {{- range .Messages }}<|im_start|>{{ .Role }} {{ .Content }}<|im_end|> {{ end }}<|im_start|>assistant -{{ else }} -{{ if .System }}<|im_start|>system -{{ .System }}<|im_end|> ``` ### Example Tools From 8e1050f366e5451651f8385fa570b78b9c7d21cc Mon Sep 17 00:00:00 2001 From: Blake Mizerany Date: Tue, 13 Aug 2024 16:47:35 -0700 Subject: [PATCH 41/71] server: reduce max connections used in download (#6347) The previous value of 64 was WAY too high and unnecessary. It reached diminishing returns and blew past it. This is a more reasonable number for _most_ normal cases. For users on cloud servers with excellent network quality, this will keep screaming for them, without hitting our CDN limits. 
For users with relatively poor network quality, this will keep them from saturating their network and causing other issues. --- server/download.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/download.go b/server/download.go index 1bca86bf..02f7ae88 100644 --- a/server/download.go +++ b/server/download.go @@ -94,7 +94,7 @@ func (p *blobDownloadPart) UnmarshalJSON(b []byte) error { } const ( - numDownloadParts = 64 + numDownloadParts = 16 minDownloadPartSize int64 = 100 * format.MegaByte maxDownloadPartSize int64 = 1000 * format.MegaByte ) From 0a8d6ea86d54bbda9d701c38e4279a9c5c204cd9 Mon Sep 17 00:00:00 2001 From: longtao <39115651+eust-w@users.noreply.github.com> Date: Wed, 14 Aug 2024 08:54:19 +0800 Subject: [PATCH 42/71] Fix typo and improve readability (#5964) * Fix typo and improve readability Summary: * Rename updatAvailableMenuID to updateAvailableMenuID * Replace unused cmd parameter with _ in RunServer function * Fix typos in comments (cherry picked from commit 5b8715f0b04773369e8eb1f9e6737995a0ab3ba7) * Update api/client.go Co-authored-by: Jeffrey Morgan --------- Co-authored-by: Jeffrey Morgan --- api/client.go | 4 ++-- app/tray/wintray/menus.go | 14 +++++++------- cmd/cmd.go | 2 +- types/model/name.go | 2 +- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/api/client.go b/api/client.go index bbdf8202..2528fb21 100644 --- a/api/client.go +++ b/api/client.go @@ -298,7 +298,7 @@ func (c *Client) List(ctx context.Context) (*ListResponse, error) { return &lr, nil } -// List running models. +// ListRunning lists running models. func (c *Client) ListRunning(ctx context.Context) (*ProcessResponse, error) { var lr ProcessResponse if err := c.do(ctx, http.MethodGet, "/api/ps", nil, &lr); err != nil { @@ -333,7 +333,7 @@ func (c *Client) Show(ctx context.Context, req *ShowRequest) (*ShowResponse, err return &resp, nil } -// Hearbeat checks if the server has started and is responsive; if yes, it +// Heartbeat checks if the server has started and is responsive; if yes, it // returns nil, otherwise an error. 
func (c *Client) Heartbeat(ctx context.Context) error { if err := c.do(ctx, http.MethodHead, "/", nil, nil); err != nil { diff --git a/app/tray/wintray/menus.go b/app/tray/wintray/menus.go index 9cb3b893..59624444 100644 --- a/app/tray/wintray/menus.go +++ b/app/tray/wintray/menus.go @@ -11,12 +11,12 @@ import ( ) const ( - updatAvailableMenuID = 1 - updateMenuID = updatAvailableMenuID + 1 - separatorMenuID = updateMenuID + 1 - diagLogsMenuID = separatorMenuID + 1 - diagSeparatorMenuID = diagLogsMenuID + 1 - quitMenuID = diagSeparatorMenuID + 1 + updateAvailableMenuID = 1 + updateMenuID = updateAvailableMenuID + 1 + separatorMenuID = updateMenuID + 1 + diagLogsMenuID = separatorMenuID + 1 + diagSeparatorMenuID = diagLogsMenuID + 1 + quitMenuID = diagSeparatorMenuID + 1 ) func (t *winTray) initMenus() error { @@ -35,7 +35,7 @@ func (t *winTray) initMenus() error { func (t *winTray) UpdateAvailable(ver string) error { if !t.updateNotified { slog.Debug("updating menu and sending notification for new update") - if err := t.addOrUpdateMenuItem(updatAvailableMenuID, 0, updateAvailableMenuTitle, true); err != nil { + if err := t.addOrUpdateMenuItem(updateAvailableMenuID, 0, updateAvailableMenuTitle, true); err != nil { return fmt.Errorf("unable to create menu entries %w", err) } if err := t.addOrUpdateMenuItem(updateMenuID, 0, updateMenutTitle, false); err != nil { diff --git a/cmd/cmd.go b/cmd/cmd.go index 2356110e..fd7246c8 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -1125,7 +1125,7 @@ func generate(cmd *cobra.Command, opts runOptions) error { return nil } -func RunServer(cmd *cobra.Command, _ []string) error { +func RunServer(_ *cobra.Command, _ []string) error { if err := initializeKeypair(); err != nil { return err } diff --git a/types/model/name.go b/types/model/name.go index 018cb2f5..75b35ef7 100644 --- a/types/model/name.go +++ b/types/model/name.go @@ -219,7 +219,7 @@ func (n Name) String() string { return b.String() } -// DisplayShort returns a short string version of the name. +// DisplayShortest returns a short string version of the name. func (n Name) DisplayShortest() string { var sb strings.Builder From 8200c371aed68dec5c74e869491ee8e5749ba1eb Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Wed, 14 Aug 2024 15:19:50 -0700 Subject: [PATCH 43/71] add `CONTRIBUTING.md` (#6349) --- CONTRIBUTING.md | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 CONTRIBUTING.md diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 00000000..7f12a0fc --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,37 @@ +# Contributing to Ollama + +Thank you for your interest in contributing to Ollama! Here are a few guidelines to help get you started. + +## Set up + +See the [development documentation](./docs/development.md) for instructions on how to build and run Ollama locally. + +## Pull requests + +### Ideal issues + +* [Bugs](https://github.com/ollama/ollama/issues?q=is%3Aissue+is%3Aopen+label%3Abug): issues where Ollama stops working or where it results in an unexpected error. +* [Performance](https://github.com/ollama/ollama/issues?q=is%3Aissue+is%3Aopen+label%3Aperformance): issues to make Ollama faster at model inference, downloading or uploading. +* [Security](https://github.com/ollama/ollama/blob/main/SECURITY.md): issues that could lead to a security vulnerability. As mentioned in [SECURITY.md](https://github.com/ollama/ollama/blob/main/SECURITY.md), please do not disclose security vulnerabilities publicly. 
+ +### Issues that are harder to review + +* New features: new features (e.g. API fields, environment variables) add surface area to Ollama and make it harder to maintain in the long run as they cannot be removed without potentially breaking users in the future. +* Refactoring: large code improvements are important, but can be harder or take longer to review and merge. +* Documentation: small updates to fill in or correct missing documentation are helpful; however, large documentation additions can be hard to maintain over time. + +### Issues that may not be accepted + +* Changes that break backwards compatibility in Ollama's API (including the OpenAI-compatible API) +* Changes that add significant friction to the user experience +* Changes that create a large future maintenance burden for maintainers and contributors + +### Best practices + +* Commit messages: please leave both a title and a description in your commit messages. The title should be a short summary of the changes, with a leading word that explains the section of the code being changed (e.g. `api: fix parsing of prompt field`). In the description, leave 2-3 short sentences that explain more about the change and its impact. +* Tests: please add test coverage to changes where possible. +* Minimize dependencies: avoid adding new dependencies unless absolutely necessary. + +## Need help? + +If you need help with anything, feel free to reach out to us on our [Discord server](https://discord.gg/ollama). From b3f75fc812fc1559090a7fd9739bd203817a5979 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Wed, 14 Aug 2024 14:37:51 -0700 Subject: [PATCH 44/71] fix noprune --- server/images.go | 63 ++++++++++++++++++------------------------------ 1 file changed, 24 insertions(+), 39 deletions(-) diff --git a/server/images.go b/server/images.go index 0e753f56..798ed818 100644 --- a/server/images.go +++ b/server/images.go @@ -215,25 +215,20 @@ func GetManifest(mp ModelPath) (*Manifest, string, error) { return nil, "", err } - if _, err = os.Stat(fp); err != nil { - return nil, "", err - } - - var manifest *Manifest - - bts, err := os.ReadFile(fp) + f, err := os.Open(fp) if err != nil { - return nil, "", fmt.Errorf("couldn't open file '%s'", fp) + return nil, "", err } + defer f.Close() - shaSum := sha256.Sum256(bts) - shaStr := hex.EncodeToString(shaSum[:]) + sha256sum := sha256.New() - if err := json.Unmarshal(bts, &manifest); err != nil { + var manifest Manifest + if err := json.NewDecoder(io.TeeReader(f, sha256sum)).Decode(&manifest); err != nil { return nil, "", err } - return manifest, shaStr, nil + return &manifest, hex.EncodeToString(sha256sum.Sum(nil)), nil } func GetModel(name string) (*Model, error) { @@ -716,7 +711,7 @@ func deleteUnusedLayers(skipModelPath *ModelPath, deleteMap map[string]struct{}) // save (i.e.
delete from the deleteMap) any files used in other manifests manifest, _, err := GetManifest(fmp) if err != nil { - return err + return fmt.Errorf("error reading manifest %s: %w", path, err) } for _, layer := range manifest.Layers { @@ -781,8 +776,7 @@ func PruneLayers() error { slog.Info(fmt.Sprintf("total blobs: %d", len(deleteMap))) - err = deleteUnusedLayers(nil, deleteMap) - if err != nil { + if err := deleteUnusedLayers(nil, deleteMap); err != nil { slog.Error(fmt.Sprintf("couldn't remove unused layers: %v", err)) return nil } @@ -877,26 +871,19 @@ func PushModel(ctx context.Context, name string, regOpts *registryOptions, fn fu func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn func(api.ProgressResponse)) error { mp := ParseModelPath(name) - var manifest *Manifest - var err error - var noprune string - // build deleteMap to prune unused layers deleteMap := make(map[string]struct{}) - - if !envconfig.NoPrune() { - manifest, _, err = GetManifest(mp) - if err != nil && !errors.Is(err, os.ErrNotExist) { - return err + manifest, _, err := GetManifest(mp) + if errors.Is(err, os.ErrNotExist) { + // noop + } else if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } else { + for _, l := range manifest.Layers { + deleteMap[l.Digest] = struct{}{} } - - if manifest != nil { - for _, l := range manifest.Layers { - deleteMap[l.Digest] = struct{}{} - } - if manifest.Config.Digest != "" { - deleteMap[manifest.Config.Digest] = struct{}{} - } + if manifest.Config.Digest != "" { + deleteMap[manifest.Config.Digest] = struct{}{} } } @@ -975,11 +962,9 @@ func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn fu return err } - if noprune == "" { - fn(api.ProgressResponse{Status: "removing any unused layers"}) - err = deleteUnusedLayers(nil, deleteMap) - if err != nil { - slog.Error(fmt.Sprintf("couldn't remove unused layers: %v", err)) + if !envconfig.NoPrune() && len(deleteMap) > 0 { + fn(api.ProgressResponse{Status: "removing unused layers"}) + if err := deleteUnusedLayers(nil, deleteMap); err != nil { fn(api.ProgressResponse{Status: fmt.Sprintf("couldn't remove unused layers: %v", err)}) } } @@ -1000,12 +985,12 @@ func pullModelManifest(ctx context.Context, mp ModelPath, regOpts *registryOptio } defer resp.Body.Close() - var m *Manifest + var m Manifest if err := json.NewDecoder(resp.Body).Decode(&m); err != nil { return nil, err } - return m, err + return &m, err } // GetSHA256Digest returns the SHA256 hash of a given buffer and returns it, and the size of buffer From 237dccba1edb41bb65ed1ffc6eafdd40dd6085e4 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Wed, 14 Aug 2024 16:36:07 -0700 Subject: [PATCH 45/71] skip invalid manifest files --- server/images.go | 35 +++++------------------------------ server/manifest.go | 2 +- 2 files changed, 6 insertions(+), 31 deletions(-) diff --git a/server/images.go b/server/images.go index 798ed818..8b3a67cf 100644 --- a/server/images.go +++ b/server/images.go @@ -687,43 +687,18 @@ func CopyModel(src, dst model.Name) error { return err } -func deleteUnusedLayers(skipModelPath *ModelPath, deleteMap map[string]struct{}) error { - fp, err := GetManifestPath() +func deleteUnusedLayers(deleteMap map[string]struct{}) error { + manifests, err := Manifests() if err != nil { return err } - walkFunc := func(path string, info os.FileInfo, _ error) error { - if info.IsDir() { - return nil - } - - dir, file := filepath.Split(path) - dir = strings.Trim(strings.TrimPrefix(dir, fp), string(os.PathSeparator)) - tag 
:= strings.Join([]string{dir, file}, ":") - fmp := ParseModelPath(tag) - - // skip the manifest we're trying to delete - if skipModelPath != nil && skipModelPath.GetFullTagname() == fmp.GetFullTagname() { - return nil - } - - // save (i.e. delete from the deleteMap) any files used in other manifests - manifest, _, err := GetManifest(fmp) - if err != nil { - return fmt.Errorf("error reading manifest %s: %w", path, err) - } - + for _, manifest := range manifests { for _, layer := range manifest.Layers { delete(deleteMap, layer.Digest) } delete(deleteMap, manifest.Config.Digest) - return nil - } - - if err := filepath.Walk(fp, walkFunc); err != nil { - return err } // only delete the files which are still in the deleteMap @@ -776,7 +751,7 @@ func PruneLayers() error { slog.Info(fmt.Sprintf("total blobs: %d", len(deleteMap))) - if err := deleteUnusedLayers(nil, deleteMap); err != nil { + if err := deleteUnusedLayers(deleteMap); err != nil { slog.Error(fmt.Sprintf("couldn't remove unused layers: %v", err)) return nil } @@ -964,7 +939,7 @@ func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn fu if !envconfig.NoPrune() && len(deleteMap) > 0 { fn(api.ProgressResponse{Status: "removing unused layers"}) - if err := deleteUnusedLayers(nil, deleteMap); err != nil { + if err := deleteUnusedLayers(deleteMap); err != nil { fn(api.ProgressResponse{Status: fmt.Sprintf("couldn't remove unused layers: %v", err)}) } } diff --git a/server/manifest.go b/server/manifest.go index 6a5d7b88..0f19641d 100644 --- a/server/manifest.go +++ b/server/manifest.go @@ -150,7 +150,7 @@ func Manifests() (map[model.Name]*Manifest, error) { n := model.ParseNameFromFilepath(rel) if !n.IsValid() { - slog.Warn("bad manifest name", "path", rel, "error", err) + slog.Warn("bad manifest name", "path", rel) continue } From 3a75e74e34c976d596437c8aa14587ada562301e Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Thu, 15 Aug 2024 10:29:14 -0700 Subject: [PATCH 46/71] only skip invalid json manifests --- server/manifest.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/server/manifest.go b/server/manifest.go index 0f19641d..6b04753f 100644 --- a/server/manifest.go +++ b/server/manifest.go @@ -5,6 +5,7 @@ import ( "encoding/hex" "encoding/json" "errors" + "fmt" "io" "log/slog" "os" @@ -155,9 +156,11 @@ func Manifests() (map[model.Name]*Manifest, error) { } m, err := ParseNamedManifest(n) - if err != nil { + if syntax := &(json.SyntaxError{}); errors.As(err, &syntax) { slog.Warn("bad manifest", "name", n, "error", err) continue + } else if err != nil { + return nil, fmt.Errorf("%s: %w", n, err) } ms[n] = m From a84c05cf9140c2eb288a6c7b56bb1c592bbaacc7 Mon Sep 17 00:00:00 2001 From: eust-w Date: Fri, 16 Aug 2024 06:00:12 +0800 Subject: [PATCH 47/71] fix: Add tooltip to system tray icon - Updated setIcon method to include tooltip text for the system tray icon. - Added NIF_TIP flag and set the tooltip text using UTF16 encoding. 
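For context, the conversion this patch relies on can be sketched in isolation. Below is a minimal, self-contained example of copying a Go string into a fixed-size UTF-16 tooltip buffer; the 128-element size and the variable names are illustrative assumptions rather than the actual NOTIFYICONDATA definition, and the portable unicode/utf16 package stands in here for the Windows-only syscall.UTF16FromString used in the diff:

```go
package main

import (
	"fmt"
	"unicode/utf16"
)

func main() {
	// Assumed fixed-size tooltip buffer, mirroring NOTIFYICONDATA.szTip.
	var tip [128]uint16

	// Encode the tooltip text as UTF-16 code units.
	s := utf16.Encode([]rune("Ollama"))

	// copy stops at the shorter of src and dst, so an overlong tooltip is
	// silently truncated rather than overflowing the buffer.
	n := copy(tip[:], s)
	fmt.Println("copied", n, "UTF-16 code units")
}
```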
Resolves: #6372 --- app/tray/wintray/tray.go | 8 +++++++- app/tray/wintray/w32api.go | 1 + 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/app/tray/wintray/tray.go b/app/tray/wintray/tray.go index ccd087a1..6f827893 100644 --- a/app/tray/wintray/tray.go +++ b/app/tray/wintray/tray.go @@ -11,6 +11,7 @@ import ( "path/filepath" "sort" "sync" + "syscall" "unsafe" "golang.org/x/sys/windows" @@ -433,7 +434,12 @@ func (t *winTray) setIcon(src string) error { t.muNID.Lock() defer t.muNID.Unlock() t.nid.Icon = h - t.nid.Flags |= NIF_ICON + t.nid.Flags |= NIF_ICON | NIF_TIP + if toolTipUTF16, err := syscall.UTF16FromString(commontray.ToolTip); err == nil { + copy(t.nid.Tip[:], toolTipUTF16) + } else { + return err + } t.nid.Size = uint32(unsafe.Sizeof(*t.nid)) return t.nid.modify() diff --git a/app/tray/wintray/w32api.go b/app/tray/wintray/w32api.go index a1e0381d..7c7c0ac8 100644 --- a/app/tray/wintray/w32api.go +++ b/app/tray/wintray/w32api.go @@ -61,6 +61,7 @@ const ( MIIM_SUBMENU = 0x00000004 MIM_APPLYTOSUBMENUS = 0x80000000 NIF_ICON = 0x00000002 + NIF_TIP = 0x00000004 NIF_INFO = 0x00000010 NIF_MESSAGE = 0x00000001 SW_HIDE = 0 From bdc4308afb72d47ce63583427f810b02d569d58a Mon Sep 17 00:00:00 2001 From: zwwhdls Date: Fri, 16 Aug 2024 11:43:19 +0800 Subject: [PATCH 48/71] fix: chmod new layer to 0o644 when creating it Signed-off-by: zwwhdls --- server/layer.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/server/layer.go b/server/layer.go index c666bd10..0bdee72b 100644 --- a/server/layer.go +++ b/server/layer.go @@ -51,6 +51,9 @@ func NewLayer(r io.Reader, mediatype string) (Layer, error) { if err := os.Rename(temp.Name(), blob); err != nil { return Layer{}, err } + if err := os.Chmod(blob, 0o644); err != nil { + return Layer{}, err + } } return Layer{ From 0ad0e738cd7ed1266b3c210ad54dcd2b70142563 Mon Sep 17 00:00:00 2001 From: Richard Lyons Date: Sun, 18 Aug 2024 01:43:26 +0200 Subject: [PATCH 49/71] Override numParallel only if unset. --- server/sched.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/server/sched.go b/server/sched.go index 9947fd32..4d9c0296 100644 --- a/server/sched.go +++ b/server/sched.go @@ -734,7 +734,9 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoL // If multiple Libraries are detected, pick the Library which loads the most layers for the model func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList { - *numParallel = 1 + if *numParallel <= 0 { + *numParallel = 1 + } byLibrary := gpus.ByLibrary() if len(byLibrary) <= 1 { return gpus From 9352eeb752531decccc7c6b91a07bc3dd5efa67e Mon Sep 17 00:00:00 2001 From: Richard Lyons Date: Sun, 18 Aug 2024 02:55:01 +0200 Subject: [PATCH 50/71] Reset NumCtx. --- server/sched.go | 1 + 1 file changed, 1 insertion(+) diff --git a/server/sched.go b/server/sched.go index 4d9c0296..3fe6d7fc 100644 --- a/server/sched.go +++ b/server/sched.go @@ -736,6 +736,7 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoL func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList { if *numParallel <= 0 { *numParallel = 1 + req.opts.NumCtx = req.origNumCtx } byLibrary := gpus.ByLibrary() if len(byLibrary) <= 1 { From 885cf45087863aa2e064a05da99e8bd07d69970a Mon Sep 17 00:00:00 2001 From: Richard Lyons Date: Sun, 18 Aug 2024 03:07:16 +0200 Subject: [PATCH 51/71] Fix white space. 
--- server/sched.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/server/sched.go b/server/sched.go index 3fe6d7fc..9d8c4144 100644 --- a/server/sched.go +++ b/server/sched.go @@ -736,8 +736,8 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoL func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList { if *numParallel <= 0 { *numParallel = 1 - req.opts.NumCtx = req.origNumCtx - } + req.opts.NumCtx = req.origNumCtx + } byLibrary := gpus.ByLibrary() if len(byLibrary) <= 1 { return gpus From 9fddef3731842bd8f40d217da6b84ab7ef5dfe97 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Mon, 19 Aug 2024 09:20:52 -0700 Subject: [PATCH 52/71] server: limit upload parts to 16 (#6411) --- server/upload.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/upload.go b/server/upload.go index 2f115436..020e8955 100644 --- a/server/upload.go +++ b/server/upload.go @@ -45,7 +45,7 @@ type blobUpload struct { } const ( - numUploadParts = 64 + numUploadParts = 16 minUploadPartSize int64 = 100 * format.MegaByte maxUploadPartSize int64 = 1000 * format.MegaByte ) From 74d45f010276c2f2653f3ca8c4f76cb0552fb46e Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Mon, 8 Jul 2024 12:50:11 -0700 Subject: [PATCH 53/71] Refactor linux packaging This adjusts linux to follow a similar model to windows with a discrete archive (zip/tgz) to carry the primary executable, and dependent libraries. Runners are still carried as payloads inside the main binary. Darwin retains the payload model where the Go binary is fully self-contained. --- .github/workflows/release.yaml | 1 - Dockerfile | 29 ++++++------ app/ollama.iss | 11 +---- envconfig/config.go | 4 +- gpu/amd_common.go | 2 +- gpu/amd_windows.go | 2 +- gpu/gpu.go | 50 ++++++++++++++------- gpu/gpu_linux.go | 2 +- llm/ext_server/CMakeLists.txt | 3 +- llm/generate/gen_common.sh | 17 ++++++- llm/generate/gen_linux.sh | 81 ++++++++++++++++------------------ llm/generate/gen_windows.ps1 | 43 +++++++++--------- llm/server.go | 12 +++-- scripts/build_linux.sh | 10 ++--- scripts/build_windows.ps1 | 12 ++--- scripts/install.sh | 31 ++++++++++--- 16 files changed, 171 insertions(+), 139 deletions(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 5ae630c3..9287f6f7 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -363,7 +363,6 @@ jobs: - run: | ./scripts/build_linux.sh ./scripts/build_docker.sh - mv dist/deps/* dist/ - uses: actions/upload-artifact@v4 with: name: dist-linux-amd64 diff --git a/Dockerfile b/Dockerfile index c8efdd8a..120ddc21 100644 --- a/Dockerfile +++ b/Dockerfile @@ -18,6 +18,7 @@ ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH COPY --from=llm-code / /go/src/github.com/ollama/ollama/ WORKDIR /go/src/github.com/ollama/ollama/llm/generate ARG CGO_CFLAGS +ENV GOARCH amd64 RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION-devel-rockylinux8 AS cuda-build-arm64 @@ -28,6 +29,7 @@ ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH COPY --from=llm-code / /go/src/github.com/ollama/ollama/ WORKDIR /go/src/github.com/ollama/ollama/llm/generate ARG CGO_CFLAGS +ENV GOARCH arm64 RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS rocm-build-amd64 @@ -40,15 +42,10 @@ COPY --from=llm-code /
/go/src/github.com/ollama/ollama/ WORKDIR /go/src/github.com/ollama/ollama/llm/generate ARG CGO_CFLAGS ARG AMDGPU_TARGETS +ENV GOARCH amd64 RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh -RUN mkdir /tmp/scratch && \ - for dep in $(zcat /go/src/github.com/ollama/ollama/llm/build/linux/x86_64/rocm*/bin/deps.txt.gz) ; do \ - cp ${dep} /tmp/scratch/ || exit 1 ; \ - done && \ - (cd /opt/rocm/lib && tar cf - rocblas/library) | (cd /tmp/scratch/ && tar xf - ) && \ - mkdir -p /go/src/github.com/ollama/ollama/dist/deps/ && \ - (cd /tmp/scratch/ && tar czvf /go/src/github.com/ollama/ollama/dist/deps/ollama-linux-amd64-rocm.tgz . ) - +RUN mkdir -p ../../dist/linux-amd64/ollama_libs && \ + (cd /opt/rocm/lib && tar cf - rocblas/library) | (cd ../../dist/linux-amd64/ollama_libs && tar xf - ) FROM --platform=linux/amd64 centos:7 AS cpu-builder-amd64 ARG CMAKE_VERSION @@ -59,6 +56,7 @@ ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH COPY --from=llm-code / /go/src/github.com/ollama/ollama/ ARG OLLAMA_CUSTOM_CPU_DEFS ARG CGO_CFLAGS +ENV GOARCH amd64 WORKDIR /go/src/github.com/ollama/ollama/llm/generate FROM --platform=linux/amd64 cpu-builder-amd64 AS static-build-amd64 @@ -79,6 +77,7 @@ ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH COPY --from=llm-code / /go/src/github.com/ollama/ollama/ ARG OLLAMA_CUSTOM_CPU_DEFS ARG CGO_CFLAGS +ENV GOARCH arm64 WORKDIR /go/src/github.com/ollama/ollama/llm/generate FROM --platform=linux/arm64 cpu-builder-arm64 AS static-build-arm64 @@ -95,12 +94,13 @@ COPY . . COPY --from=static-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ +COPY --from=cuda-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/ COPY --from=cuda-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ +COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/ COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ -COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/deps/ ./dist/deps/ ARG GOFLAGS ARG CGO_CFLAGS -RUN go build -trimpath . +RUN go build -trimpath -o dist/linux-amd64/ollama . # Intermediate stage used for ./scripts/build_linux.sh FROM --platform=linux/arm64 cpu-build-arm64 AS build-arm64 @@ -109,23 +109,24 @@ ARG GOLANG_VERSION WORKDIR /go/src/github.com/ollama/ollama COPY . . COPY --from=static-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ +COPY --from=cuda-build-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ COPY --from=cuda-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ ARG GOFLAGS ARG CGO_CFLAGS -RUN go build -trimpath . +RUN go build -trimpath -o dist/linux-arm64/ollama . 
# Runtime stages FROM --platform=linux/amd64 ubuntu:22.04 as runtime-amd64 RUN apt-get update && apt-get install -y ca-certificates -COPY --from=build-amd64 /go/src/github.com/ollama/ollama/ollama /bin/ollama +COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/ollama /bin/ollama FROM --platform=linux/arm64 ubuntu:22.04 as runtime-arm64 RUN apt-get update && apt-get install -y ca-certificates -COPY --from=build-arm64 /go/src/github.com/ollama/ollama/ollama /bin/ollama +COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/ollama /bin/ollama # Radeon images are much larger so we keep it distinct from the CPU/CUDA image FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete as runtime-rocm RUN update-pciids -COPY --from=build-amd64 /go/src/github.com/ollama/ollama/ollama /bin/ollama +COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/ollama /bin/ollama EXPOSE 11434 ENV OLLAMA_HOST 0.0.0.0 diff --git a/app/ollama.iss b/app/ollama.iss index dc6178f7..e9cf48ec 100644 --- a/app/ollama.iss +++ b/app/ollama.iss @@ -91,16 +91,7 @@ Source: "..\ollama.exe"; DestDir: "{app}"; Flags: ignoreversion 64bit Source: "..\dist\windows-{#ARCH}\ollama_runners\*"; DestDir: "{app}\ollama_runners"; Flags: ignoreversion 64bit recursesubdirs Source: "..\dist\ollama_welcome.ps1"; DestDir: "{app}"; Flags: ignoreversion Source: ".\assets\app.ico"; DestDir: "{app}"; Flags: ignoreversion -#if DirExists("..\dist\windows-amd64\cuda") - Source: "..\dist\windows-amd64\cuda\*"; DestDir: "{app}\cuda\"; Flags: ignoreversion recursesubdirs -#endif -#if DirExists("..\dist\windows-amd64\oneapi") - Source: "..\dist\windows-amd64\oneapi\*"; DestDir: "{app}\oneapi\"; Flags: ignoreversion recursesubdirs -#endif -#if DirExists("..\dist\windows-amd64\rocm") - Source: "..\dist\windows-amd64\rocm\*"; DestDir: "{app}\rocm\"; Flags: ignoreversion recursesubdirs -#endif - +Source: "..\dist\windows-amd64\ollama_libs\*"; DestDir: "{app}\ollama_libs\"; Flags: ignoreversion recursesubdirs [Icons] Name: "{group}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilename: "{app}\app.ico" diff --git a/envconfig/config.go b/envconfig/config.go index b82b773d..7f0976c0 100644 --- a/envconfig/config.go +++ b/envconfig/config.go @@ -193,8 +193,8 @@ func RunnersDir() (p string) { for _, root := range []string{filepath.Dir(exe), cwd} { paths = append(paths, root, - filepath.Join(root, "windows-"+runtime.GOARCH), - filepath.Join(root, "dist", "windows-"+runtime.GOARCH), + filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH), + filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH), ) } diff --git a/gpu/amd_common.go b/gpu/amd_common.go index 2839cb7c..05747208 100644 --- a/gpu/amd_common.go +++ b/gpu/amd_common.go @@ -54,7 +54,7 @@ func commonAMDValidateLibDir() (string, error) { // Installer payload location if we're running the installed binary exe, err := os.Executable() if err == nil { - rocmTargetDir := filepath.Join(filepath.Dir(exe), "rocm") + rocmTargetDir := filepath.Join(filepath.Dir(exe), "ollama_libs") if rocmLibUsable(rocmTargetDir) { slog.Debug("detected ROCM next to ollama executable " + rocmTargetDir) return rocmTargetDir, nil diff --git a/gpu/amd_windows.go b/gpu/amd_windows.go index edabeb43..5d25a966 100644 --- a/gpu/amd_windows.go +++ b/gpu/amd_windows.go @@ -153,7 +153,7 @@ func AMDValidateLibDir() (string, error) { // Installer payload (if we're running from some other location) localAppData := os.Getenv("LOCALAPPDATA") appDir := 
filepath.Join(localAppData, "Programs", "Ollama") - rocmTargetDir := filepath.Join(appDir, "rocm") + rocmTargetDir := filepath.Join(appDir, "ollama_libs") if rocmLibUsable(rocmTargetDir) { slog.Debug("detected ollama installed ROCm at " + rocmTargetDir) return rocmTargetDir, nil diff --git a/gpu/gpu.go b/gpu/gpu.go index dc124a3e..d0ae0f34 100644 --- a/gpu/gpu.go +++ b/gpu/gpu.go @@ -229,11 +229,7 @@ func GetGPUInfo() GpuInfoList { return GpuInfoList{cpus[0].GpuInfo} } - // On windows we bundle the nvidia library one level above the runner dir - depPath := "" - if runtime.GOOS == "windows" && envconfig.RunnersDir() != "" { - depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir()), "cuda") - } + depPath := GetDepDir() // Load ALL libraries cHandles = initCudaHandles() @@ -306,13 +302,6 @@ func GetGPUInfo() GpuInfoList { if envconfig.IntelGPU() { oHandles = initOneAPIHandles() if oHandles != nil && oHandles.oneapi != nil { - - // On windows we bundle the oneapi library one level above the runner dir - depPath = "" - if runtime.GOOS == "windows" && envconfig.RunnersDir() != "" { - depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir()), "oneapi") - } - for d := range oHandles.oneapi.num_drivers { if oHandles.oneapi == nil { // shouldn't happen @@ -467,10 +456,12 @@ func GetGPUInfo() GpuInfoList { func FindGPULibs(baseLibName string, defaultPatterns []string) []string { // Multiple GPU libraries may exist, and some may not work, so keep trying until we exhaust them var ldPaths []string - var patterns []string gpuLibPaths := []string{} slog.Debug("Searching for GPU library", "name", baseLibName) + // Start with our bundled libraries + patterns := []string{filepath.Join(GetDepDir(), baseLibName)} + switch runtime.GOOS { case "windows": ldPaths = strings.Split(os.Getenv("PATH"), ";") @@ -479,13 +470,14 @@ func FindGPULibs(baseLibName string, defaultPatterns []string) []string { default: return gpuLibPaths } - // Start with whatever we find in the PATH/LD_LIBRARY_PATH + + // Then with whatever we find in the PATH/LD_LIBRARY_PATH for _, ldPath := range ldPaths { d, err := filepath.Abs(ldPath) if err != nil { continue } - patterns = append(patterns, filepath.Join(d, baseLibName+"*")) + patterns = append(patterns, filepath.Join(d, baseLibName)) } patterns = append(patterns, defaultPatterns...) 
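	// Net search order: the bundled ollama_libs directory first, then each
	// PATH/LD_LIBRARY_PATH entry, then the platform default patterns.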
slog.Debug("gpu library search", "globs", patterns) @@ -641,3 +633,31 @@ func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) { return "", "" } } + +func GetDepDir() string { + // On Windows/linux we bundle the dependencies at the same level as the executable + appExe, err := os.Executable() + if err != nil { + slog.Warn("failed to lookup executable path", "error", err) + } + cwd, err := os.Getwd() + if err != nil { + slog.Warn("failed to lookup working directory", "error", err) + } + // Scan for any of our dependeices, and pick first match + for _, root := range []string{filepath.Dir(appExe), cwd} { + libDep := "ollama_libs" + if _, err := os.Stat(filepath.Join(root, libDep)); err == nil { + return filepath.Join(root, libDep) + } + // Developer mode, local build + if _, err := os.Stat(filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH, libDep)); err == nil { + return filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH, libDep) + } + if _, err := os.Stat(filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH, libDep)); err == nil { + return filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH, libDep) + } + } + slog.Warn("unable to locate gpu dependency libraries") + return "" +} diff --git a/gpu/gpu_linux.go b/gpu/gpu_linux.go index d6d2675c..d4d20bc4 100644 --- a/gpu/gpu_linux.go +++ b/gpu/gpu_linux.go @@ -47,7 +47,7 @@ var ( CudartMgmtName = "libcudart.so*" NvcudaMgmtName = "libcuda.so*" NvmlMgmtName = "" // not currently wired on linux - OneapiMgmtName = "libze_intel_gpu.so" + OneapiMgmtName = "libze_intel_gpu.so*" ) func GetCPUMem() (memInfo, error) { diff --git a/llm/ext_server/CMakeLists.txt b/llm/ext_server/CMakeLists.txt index bfc97c63..90fd0ef2 100644 --- a/llm/ext_server/CMakeLists.txt +++ b/llm/ext_server/CMakeLists.txt @@ -1,12 +1,13 @@ set(TARGET ollama_llama_server) option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON) +set(LLAMA_SERVER_LDFLAGS $ENV{LLAMA_SERVER_LDFLAGS}) include_directories(${CMAKE_CURRENT_SOURCE_DIR}) add_executable(${TARGET} server.cpp utils.hpp json.hpp httplib.h) install(TARGETS ${TARGET} RUNTIME) target_compile_definitions(${TARGET} PRIVATE SERVER_VERBOSE=$ ) -target_link_libraries(${TARGET} PRIVATE ggml llama common llava ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE ggml llama common llava ${CMAKE_THREAD_LIBS_INIT} ${LLAMA_SERVER_LDFLAGS}) if (WIN32) TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32) endif() diff --git a/llm/generate/gen_common.sh b/llm/generate/gen_common.sh index da1b0688..f1541f2a 100644 --- a/llm/generate/gen_common.sh +++ b/llm/generate/gen_common.sh @@ -9,11 +9,14 @@ init_vars() { ARCH="arm64" ;; *) - ARCH=$(uname -m | sed -e "s/aarch64/arm64/g") + echo "GOARCH must be set" + echo "this script is meant to be run from within go generate" + exit 1 + ;; esac LLAMACPP_DIR=../llama.cpp - CMAKE_DEFS="" + CMAKE_DEFS="-DCMAKE_SKIP_RPATH=on" CMAKE_TARGETS="--target ollama_llama_server" if echo "${CGO_CFLAGS}" | grep -- '-g' >/dev/null; then CMAKE_DEFS="-DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_GPROF=on -DLLAMA_SERVER_VERBOSE=on ${CMAKE_DEFS}" @@ -27,6 +30,7 @@ init_vars() { WHOLE_ARCHIVE="-Wl,-force_load" NO_WHOLE_ARCHIVE="" GCC_ARCH="-arch ${ARCH}" + DIST_BASE=../../dist/darwin-${GOARCH}/ ;; "Linux") LIB_EXT="so" @@ -35,6 +39,7 @@ init_vars() { # Cross compiling not supported on linux - Use docker GCC_ARCH="" + DIST_BASE=../../dist/linux-${GOARCH}/ ;; *) ;; @@ -105,6 +110,14 @@ compress() { echo "Finished compression" } +install() { + echo 
"Installing libraries to bin dir ${BUILD_DIR}/bin/" + for lib in $(find ${BUILD_DIR} -name \*.${LIB_EXT}); do + rm -f "${BUILD_DIR}/bin/$(basename ${lib})" + cp -af "${lib}" "${BUILD_DIR}/bin/" + done +} + # Keep the local tree clean after we're done with the build cleanup() { (cd ${LLAMACPP_DIR}/ && git checkout CMakeLists.txt) diff --git a/llm/generate/gen_linux.sh b/llm/generate/gen_linux.sh index db2c6c30..70fc0313 100755 --- a/llm/generate/gen_linux.sh +++ b/llm/generate/gen_linux.sh @@ -51,7 +51,7 @@ if [ -z "${CUDACXX}" ]; then export CUDACXX=$(command -v nvcc) fi fi -COMMON_CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_OPENMP=off" +COMMON_CMAKE_DEFS="-DCMAKE_SKIP_RPATH=on -DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_OPENMP=off" source $(dirname $0)/gen_common.sh init_vars git_module_setup @@ -77,10 +77,11 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then if [ -n "${OLLAMA_CUSTOM_CPU_DEFS}" ]; then init_vars echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\"" - CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}" + CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}" BUILD_DIR="../build/linux/${ARCH}/cpu" echo "Building custom CPU" build + install compress else # Darwin Rosetta x86 emulation does NOT support AVX, AVX2, AVX512 @@ -93,7 +94,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then # -DGGML_AVX512_VBMI -- 2018 Intel Cannon Lake # -DGGML_AVX512_VNNI -- 2021 Intel Alder Lake - COMMON_CPU_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_OPENMP=off" + COMMON_CPU_DEFS="-DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_OPENMP=off" if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu" ]; then # # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta) @@ -103,6 +104,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then BUILD_DIR="../build/linux/${ARCH}/cpu" echo "Building LCD CPU" build + install compress fi @@ -120,6 +122,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then BUILD_DIR="../build/linux/${ARCH}/cpu_avx" echo "Building AVX CPU" build + install compress fi @@ -133,6 +136,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then BUILD_DIR="../build/linux/${ARCH}/cpu_avx2" echo "Building AVX2 CPU" build + install compress fi fi @@ -178,29 +182,18 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${OLLAMA_CUSTOM_CUDA_DEFS}" echo "Building custom CUDA GPU" else - CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_FLAGS=-t8 -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}" + CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}" fi - CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS}" + export CUDAFLAGS="-t8" + CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS} -DGGML_STATIC=off" BUILD_DIR="../build/linux/${ARCH}/cuda${CUDA_VARIANT}" - EXTRA_LIBS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda" + export LLAMA_SERVER_LDFLAGS="-L${CUDA_LIB_DIR} -lcudart -lcublas 
-lcublasLt -lcuda" + CUDA_DIST_DIR="${DIST_BASE}/ollama_libs" build - - # Carry the CUDA libs as payloads to help reduce dependency burden on users - # - # TODO - in the future we may shift to packaging these separately and conditionally - # downloading them in the install script. - DEPS="$(ldd ${BUILD_DIR}/bin/ollama_llama_server )" - for lib in libcudart.so libcublas.so libcublasLt.so ; do - DEP=$(echo "${DEPS}" | grep ${lib} | cut -f1 -d' ' | xargs || true) - if [ -n "${DEP}" -a -e "${CUDA_LIB_DIR}/${DEP}" ]; then - cp "${CUDA_LIB_DIR}/${DEP}" "${BUILD_DIR}/bin/" - elif [ -e "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" ]; then - cp "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" "${BUILD_DIR}/bin/" - elif [ -e "${CUDART_LIB_DIR}/${lib}" ]; then - cp -d ${CUDART_LIB_DIR}/${lib}* "${BUILD_DIR}/bin/" - else - cp -d "${CUDA_LIB_DIR}/${lib}*" "${BUILD_DIR}/bin/" - fi + install + mkdir -p "${CUDA_DIST_DIR}" + for lib in ${CUDA_LIB_DIR}/libcudart.so* ${CUDA_LIB_DIR}/libcublas.so* ${CUDA_LIB_DIR}/libcublasLt.so* ; do + cp -a "${lib}" "${CUDA_DIST_DIR}" done compress @@ -218,21 +211,24 @@ if [ -z "${OLLAMA_SKIP_ONEAPI_GENERATE}" -a -d "${ONEAPI_ROOT}" ]; then CC=icx CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL=ON -DGGML_SYCL_F16=OFF" BUILD_DIR="../build/linux/${ARCH}/oneapi" - EXTRA_LIBS="-fsycl -Wl,-rpath,${ONEAPI_ROOT}/compiler/latest/lib,-rpath,${ONEAPI_ROOT}/mkl/latest/lib,-rpath,${ONEAPI_ROOT}/tbb/latest/lib,-rpath,${ONEAPI_ROOT}/compiler/latest/opt/oclfpga/linux64/lib -lOpenCL -lmkl_core -lmkl_sycl_blas -lmkl_intel_ilp64 -lmkl_tbb_thread -ltbb" + ONEAPI_DIST_DIR="${DIST_BASE}/ollama_libs" + export LLAMA_SERVER_LDFLAGS="-fsycl -lOpenCL -lmkl_core -lmkl_sycl_blas -lmkl_intel_ilp64 -lmkl_tbb_thread -ltbb" DEBUG_FLAGS="" # icx compiles with -O0 if we pass -g, so we must remove it build # copy oneAPI dependencies + mkdir -p "${ONEAPI_DIST_DIR}" for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e sycl -e mkl -e tbb); do - cp "${dep}" "${BUILD_DIR}/bin/" + cp -a "${dep}" "${ONEAPI_DIST_DIR}" done - cp "${ONEAPI_ROOT}/compiler/latest/lib/libOpenCL.so" "${BUILD_DIR}/bin/" - cp "${ONEAPI_ROOT}/compiler/latest/lib/libimf.so" "${BUILD_DIR}/bin/" - cp "${ONEAPI_ROOT}/compiler/latest/lib/libintlc.so.5" "${BUILD_DIR}/bin/" - cp "${ONEAPI_ROOT}/compiler/latest/lib/libirng.so" "${BUILD_DIR}/bin/" - cp "${ONEAPI_ROOT}/compiler/latest/lib/libpi_level_zero.so" "${BUILD_DIR}/bin/" - cp "${ONEAPI_ROOT}/compiler/latest/lib/libsvml.so" "${BUILD_DIR}/bin/" - cp "${ONEAPI_ROOT}/compiler/latest/lib/libur_loader.so.0" "${BUILD_DIR}/bin/" + cp "${ONEAPI_ROOT}/compiler/latest/lib/libOpenCL.so" "${ONEAPI_DIST_DIR}" + cp "${ONEAPI_ROOT}/compiler/latest/lib/libimf.so" "${ONEAPI_DIST_DIR}" + cp "${ONEAPI_ROOT}/compiler/latest/lib/libintlc.so.5" "${ONEAPI_DIST_DIR}" + cp "${ONEAPI_ROOT}/compiler/latest/lib/libirng.so" "${ONEAPI_DIST_DIR}" + cp "${ONEAPI_ROOT}/compiler/latest/lib/libpi_level_zero.so" "${ONEAPI_DIST_DIR}" + cp "${ONEAPI_ROOT}/compiler/latest/lib/libsvml.so" "${ONEAPI_DIST_DIR}" + cp "${ONEAPI_ROOT}/compiler/latest/lib/libur_loader.so.0" "${ONEAPI_DIST_DIR}" + install compress fi @@ -262,21 +258,18 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then echo "Building custom ROCM GPU" fi BUILD_DIR="../build/linux/${ARCH}/rocm${ROCM_VARIANT}" - EXTRA_LIBS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -Wl,-rpath,\$ORIGIN/../../rocm/ -lhipblas -lrocblas -lamdhip64 -lrocsolver 
-lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu" + ROCM_DIST_DIR="${DIST_BASE}/ollama_libs" + # TODO figure out how to disable runpath (rpath) + # export CMAKE_HIP_FLAGS="-fno-rtlib-add-rpath" # doesn't work + export LLAMA_SERVER_LDFLAGS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu" build - # Record the ROCM dependencies - rm -f "${BUILD_DIR}/bin/deps.txt" - touch "${BUILD_DIR}/bin/deps.txt" - for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e rocm -e amdgpu -e libtinfo ); do - echo "${dep}" >> "${BUILD_DIR}/bin/deps.txt" + # copy the ROCM dependencies + mkdir -p "${ROCM_DIST_DIR}" + for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -v "${ARCH}/rocm${ROCM_VARIANT}" | grep -e rocm -e amdgpu -e libtinfo ); do + cp -a "${dep}"* "${ROCM_DIST_DIR}" done - # bomb out if for some reason we didn't get a few deps - if [ $(cat "${BUILD_DIR}/bin/deps.txt" | wc -l ) -lt 8 ] ; then - cat "${BUILD_DIR}/bin/deps.txt" - echo "ERROR: deps file short" - exit 1 - fi + install compress fi diff --git a/llm/generate/gen_windows.ps1 b/llm/generate/gen_windows.ps1 index d8bce92d..1f8c96d8 100644 --- a/llm/generate/gen_windows.ps1 +++ b/llm/generate/gen_windows.ps1 @@ -286,12 +286,11 @@ function build_cuda() { sign install - rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\" - md "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\" -ea 0 > $null - write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\" - cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\" - cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\" - cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\" + md "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" -ea 0 > $null + write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" + cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" + cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" + cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" } else { write-host "Skipping CUDA generation step" } @@ -325,18 +324,17 @@ function build_oneapi() { sign install - rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" - md "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" -ea 0 > $null - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libirngmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libmmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_level_zero.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_unified_runtime.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_win_proxy_loader.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\svml_dispmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" - cp 
"${env:ONEAPI_ROOT}\compiler\latest\bin\sycl7.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" - cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_core.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" - cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_sycl_blas.4.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" - cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_tbb_thread.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" + md "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" -ea 0 > $null + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libirngmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libmmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_level_zero.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_unified_runtime.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_win_proxy_loader.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\svml_dispmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\sycl7.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" + cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_core.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" + cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_sycl_blas.4.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" + cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_tbb_thread.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" } else { Write-Host "Skipping oneAPI generation step" } @@ -386,12 +384,11 @@ function build_rocm() { sign install - rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\" - md "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\rocblas\library\" -ea 0 > $null - cp "${env:HIP_PATH}\bin\hipblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\" - cp "${env:HIP_PATH}\bin\rocblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\" + md "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\rocblas\library\" -ea 0 > $null + cp "${env:HIP_PATH}\bin\hipblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" + cp "${env:HIP_PATH}\bin\rocblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" # amdhip64.dll dependency comes from the driver and must be installed on the host to use AMD GPUs - cp "${env:HIP_PATH}\bin\rocblas\library\*" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\rocblas\library\" + cp "${env:HIP_PATH}\bin\rocblas\library\*" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\rocblas\library\" } else { write-host "Skipping ROCm generation step" } diff --git a/llm/server.go b/llm/server.go index d2b8db9b..9347a458 100644 --- a/llm/server.go +++ b/llm/server.go @@ -306,20 +306,18 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr if runtime.GOOS == "windows" { pathEnv = "PATH" } - // prepend the server directory to LD_LIBRARY_PATH/PATH and the parent dir for common dependencies - libraryPaths := []string{dir, filepath.Dir(dir)} + // Start with the server directory for the LD_LIBRARY_PATH/PATH + libraryPaths := []string{dir} if libraryPath, ok := os.LookupEnv(pathEnv); ok { - // Append our runner directory to the path - // This will 
favor system libraries over our bundled library dependencies + // favor our bundled library dependencies over system libraries libraryPaths = append(libraryPaths, filepath.SplitList(libraryPath)...) } // Note: we always put the dependency path first - // since this was the exact version we verified for AMD GPUs - // and we favor what the user had in their path + // since this was the exact version we compiled/linked against if gpus[0].DependencyPath != "" { - // TODO refine for multi-gpu support + // assume gpus from the same library have the same dependency path libraryPaths = append([]string{gpus[0].DependencyPath}, libraryPaths...) } diff --git a/scripts/build_linux.sh b/scripts/build_linux.sh index 27c4ff1f..4ea51229 100755 --- a/scripts/build_linux.sh +++ b/scripts/build_linux.sh @@ -21,11 +21,9 @@ for TARGETARCH in ${BUILD_ARCH}; do -t builder:$TARGETARCH \ . docker create --platform linux/$TARGETARCH --name builder-$TARGETARCH builder:$TARGETARCH - docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/ollama ./dist/ollama-linux-$TARGETARCH - - if [ "$TARGETARCH" = "amd64" ]; then - docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/dist/deps/ ./dist/ - fi - + docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/dist/linux-$TARGETARCH ./dist docker rm builder-$TARGETARCH + echo "Compressing final linux bundle..." + rm -f ./dist/ollama-linux-$TARGETARCH.tgz + (cd dist/linux-$TARGETARCH && tar cf - . | gzip --best > ../ollama-linux-$TARGETARCH.tgz ) done diff --git a/scripts/build_windows.ps1 b/scripts/build_windows.ps1 index edc73759..e8d851f4 100644 --- a/scripts/build_windows.ps1 +++ b/scripts/build_windows.ps1 @@ -103,22 +103,22 @@ function buildApp() { function gatherDependencies() { write-host "Gathering runtime dependencies" cd "${script:SRC_DIR}" - md "${script:DEPS_DIR}\ollama_runners" -ea 0 > $null + md "${script:DEPS_DIR}\ollama_libs" -ea 0 > $null # TODO - this varies based on host build system and MSVC version - drive from dumpbin output # currently works for Win11 + MSVC 2019 + Cuda V11 - cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\msvcp140*.dll" "${script:DEPS_DIR}\ollama_runners\" - cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DEPS_DIR}\ollama_runners\" - cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DEPS_DIR}\ollama_runners\" + cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\msvcp140*.dll" "${script:DEPS_DIR}\ollama_libs\" + cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DEPS_DIR}\ollama_libs\" + cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DEPS_DIR}\ollama_libs\" foreach ($part in $("runtime", "stdio", "filesystem", "math", "convert", "heap", "string", "time", "locale", "environment")) { - cp "$env:VCToolsRedistDir\..\..\..\Tools\Llvm\x64\bin\api-ms-win-crt-${part}*.dll" "${script:DEPS_DIR}\ollama_runners\" + cp "$env:VCToolsRedistDir\..\..\..\Tools\Llvm\x64\bin\api-ms-win-crt-${part}*.dll" "${script:DEPS_DIR}\ollama_libs\" } cp "${script:SRC_DIR}\app\ollama_welcome.ps1" "${script:SRC_DIR}\dist\" if ("${env:KEY_CONTAINER}") { write-host "about to sign" - foreach ($file in (get-childitem "${script:DEPS_DIR}\cuda\cu*.dll") + @("${script:SRC_DIR}\dist\ollama_welcome.ps1")){ + foreach ($file in (get-childitem "${script:DEPS_DIR}\ollama_libs\cu*.dll") + @("${script:SRC_DIR}\dist\ollama_welcome.ps1")){ write-host "signing $file" & "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f 
"${script:OLLAMA_CERT}" ` /csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} $file diff --git a/scripts/install.sh b/scripts/install.sh index 03af5a69..f0439b00 100644 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -63,16 +63,32 @@ if [ -n "$NEEDS" ]; then exit 1 fi -status "Downloading ollama..." -curl --fail --show-error --location --progress-bar -o $TEMP_DIR/ollama "https://ollama.com/download/ollama-linux-${ARCH}${VER_PARAM}" - for BINDIR in /usr/local/bin /usr/bin /bin; do echo $PATH | grep -q $BINDIR && break || continue done +OLLAMA_INSTALL_DIR=${OLLAMA_INSTALL_DIR:-${BINDIR}} -status "Installing ollama to $BINDIR..." +status "Installing ollama to $OLLAMA_INSTALL_DIR" $SUDO install -o0 -g0 -m755 -d $BINDIR -$SUDO install -o0 -g0 -m755 $TEMP_DIR/ollama $BINDIR/ollama +$SUDO install -o0 -g0 -m755 -d "$OLLAMA_INSTALL_DIR" +if curl -I --silent --fail --location "https://ollama.com/download/ollama-linux-${ARCH}.tgz${VER_PARAM}" >/dev/null ; then + status "Downloading Linux ${ARCH} bundle" + curl --fail --show-error --location --progress-bar \ + "https://ollama.com/download/ollama-linux-${ARCH}.tgz${VER_PARAM}" | \ + $SUDO tar -xzf - -C "$OLLAMA_INSTALL_DIR" + BUNDLE=1 +else + status "Downloading Linux ${ARCH} CLI" + curl --fail --show-error --location --progress-bar -o "$TEMP_DIR/ollama"\ + "https://ollama.com/download/ollama-linux-${ARCH}${VER_PARAM}" + $SUDO install -o0 -g0 -m755 $TEMP_DIR/ollama $OLLAMA_INSTALL_DIR/ollama + BUNDLE=0 +fi + +if [ "$OLLAMA_INSTALL_DIR/ollama" != "$BINDIR/ollama" ] ; then + status "Making ollama accessible in the PATH in $BINDIR" + $SUDO ln -sf "$OLLAMA_INSTALL_DIR/ollama" "$BINDIR/ollama" +fi install_success() { status 'The Ollama API is now available at 127.0.0.1:11434.' @@ -178,6 +194,11 @@ if ! check_gpu lspci nvidia && ! check_gpu lshw nvidia && ! check_gpu lspci amdg fi if check_gpu lspci amdgpu || check_gpu lshw amdgpu; then + if [ $BUNDLE -ne 0 ]; then + install_success + status "AMD GPU ready." 
+ exit 0 + fi # Look for pre-existing ROCm v6 before downloading the dependencies for search in "${HIP_PATH:-''}" "${ROCM_PATH:-''}" "/opt/rocm" "/usr/lib64"; do if [ -n "${search}" ] && [ -e "${search}/libhipblas.so.2" -o -e "${search}/lib/libhipblas.so.2" ]; then From c7bcb0031965e33531358639620a11516d101b54 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Fri, 9 Aug 2024 07:21:40 -0700 Subject: [PATCH 54/71] Wire up ccache and pigz in the docker based build This should help speed things up a little --- Dockerfile | 37 ++++++++++++++++++++++++++----------- llm/generate/gen_common.sh | 15 +++++++++------ llm/generate/gen_darwin.sh | 2 ++ llm/generate/gen_linux.sh | 2 ++ scripts/build_linux.sh | 3 ++- scripts/rh_linux_deps.sh | 14 ++++++++++++-- 6 files changed, 53 insertions(+), 20 deletions(-) diff --git a/Dockerfile b/Dockerfile index 120ddc21..8eb90057 100644 --- a/Dockerfile +++ b/Dockerfile @@ -19,7 +19,8 @@ COPY --from=llm-code / /go/src/github.com/ollama/ollama/ WORKDIR /go/src/github.com/ollama/ollama/llm/generate ARG CGO_CFLAGS ENV GOARCH amd64 -RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh +RUN --mount=type=cache,target=/root/.ccache \ + OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION-devel-rockylinux8 AS cuda-build-arm64 ARG CMAKE_VERSION @@ -30,7 +31,12 @@ COPY --from=llm-code / /go/src/github.com/ollama/ollama/ WORKDIR /go/src/github.com/ollama/ollama/llm/generate ARG CGO_CFLAGS ENV GOARCH arm64 -RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh +RUN --mount=type=cache,target=/root/.ccache \ + OLLAMA_SKIP_STATIC_GENERATE=1 \ + OLLAMA_SKIP_CPU_GENERATE=1 \ + CMAKE_CUDA_ARCHITECTURES="${CUDA_V11_ARCHITECTURES}" \ + CUDA_VARIANT="_v11" \ + bash gen_linux.sh FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS rocm-build-amd64 ARG CMAKE_VERSION @@ -43,7 +49,8 @@ WORKDIR /go/src/github.com/ollama/ollama/llm/generate ARG CGO_CFLAGS ARG AMDGPU_TARGETS ENV GOARCH amd64 -RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh +RUN --mount=type=cache,target=/root/.ccache \ + OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh RUN mkdir -p ../../dist/linux-amd64/ollama_libs && \ (cd /opt/rocm/lib && tar cf - rocblas/library) | (cd ../../dist/linux-amd64/ollama_libs && tar xf - ) @@ -60,13 +67,17 @@ ENV GOARCH amd64 WORKDIR /go/src/github.com/ollama/ollama/llm/generate FROM --platform=linux/amd64 cpu-builder-amd64 AS static-build-amd64 -RUN OLLAMA_CPU_TARGET="static" sh gen_linux.sh +RUN --mount=type=cache,target=/root/.ccache \ + OLLAMA_CPU_TARGET="static" bash gen_linux.sh FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu-build-amd64 -RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu" sh gen_linux.sh +RUN --mount=type=cache,target=/root/.ccache \ + OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu" bash gen_linux.sh FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu_avx-build-amd64 -RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu_avx" sh gen_linux.sh +RUN --mount=type=cache,target=/root/.ccache \ + OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu_avx" bash gen_linux.sh FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu_avx2-build-amd64 -RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu_avx2" sh gen_linux.sh +RUN --mount=type=cache,target=/root/.ccache \ + OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu_avx2" bash gen_linux.sh FROM 
--platform=linux/arm64 rockylinux:8 AS cpu-builder-arm64 ARG CMAKE_VERSION @@ -81,9 +92,11 @@ ENV GOARCH arm64 WORKDIR /go/src/github.com/ollama/ollama/llm/generate FROM --platform=linux/arm64 cpu-builder-arm64 AS static-build-arm64 -RUN OLLAMA_CPU_TARGET="static" sh gen_linux.sh +RUN --mount=type=cache,target=/root/.ccache \ + OLLAMA_CPU_TARGET="static" bash gen_linux.sh FROM --platform=linux/arm64 cpu-builder-arm64 AS cpu-build-arm64 -RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu" sh gen_linux.sh +RUN --mount=type=cache,target=/root/.ccache \ + OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu" bash gen_linux.sh # Intermediate stage used for ./scripts/build_linux.sh @@ -100,7 +113,8 @@ COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/ COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ ARG GOFLAGS ARG CGO_CFLAGS -RUN go build -trimpath -o dist/linux-amd64/ollama . +RUN --mount=type=cache,target=/root/.ccache \ + go build -trimpath -o dist/linux-amd64/ollama . # Intermediate stage used for ./scripts/build_linux.sh FROM --platform=linux/arm64 cpu-build-arm64 AS build-arm64 @@ -113,7 +127,8 @@ COPY --from=cuda-build-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ COPY --from=cuda-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ ARG GOFLAGS ARG CGO_CFLAGS -RUN go build -trimpath -o dist/linux-arm64/ollama . +RUN --mount=type=cache,target=/root/.ccache \ + go build -trimpath -o dist/linux-arm64/ollama . # Runtime stages FROM --platform=linux/amd64 ubuntu:22.04 as runtime-amd64 diff --git a/llm/generate/gen_common.sh b/llm/generate/gen_common.sh index f1541f2a..40115936 100644 --- a/llm/generate/gen_common.sh +++ b/llm/generate/gen_common.sh @@ -47,6 +47,7 @@ init_vars() { if [ -z "${CMAKE_CUDA_ARCHITECTURES}" ] ; then CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80" fi + GZIP=$(which pigz 2>/dev/null || echo "gzip") } git_module_setup() { @@ -90,21 +91,23 @@ build() { compress() { echo "Compressing payloads to reduce overall binary size..." - pids="" rm -rf ${BUILD_DIR}/bin/*.gz for f in ${BUILD_DIR}/bin/* ; do - gzip -n --best -f ${f} & - pids+=" $!" + ${GZIP} -n --best -f ${f} & + compress_pids+=" $!" done # check for lib directory if [ -d ${BUILD_DIR}/lib ]; then for f in ${BUILD_DIR}/lib/* ; do - gzip -n --best -f ${f} & - pids+=" $!" + ${GZIP} -n --best -f ${f} & + compress_pids+=" $!" done fi echo - for pid in ${pids}; do +} + +wait_for_compress() { + for pid in ${compress_pids}; do wait $pid done echo "Finished compression" diff --git a/llm/generate/gen_darwin.sh b/llm/generate/gen_darwin.sh index 6c0b62cb..f22c0f8e 100755 --- a/llm/generate/gen_darwin.sh +++ b/llm/generate/gen_darwin.sh @@ -6,6 +6,7 @@ set -ex set -o pipefail +compress_pids="" echo "Starting darwin generate script" source $(dirname $0)/gen_common.sh init_vars @@ -98,4 +99,5 @@ case "${GOARCH}" in esac cleanup +wait_for_compress echo "go generate completed. LLM runners: $(cd ${BUILD_DIR}/..; echo *)" diff --git a/llm/generate/gen_linux.sh b/llm/generate/gen_linux.sh index 70fc0313..1365d07d 100755 --- a/llm/generate/gen_linux.sh +++ b/llm/generate/gen_linux.sh @@ -13,6 +13,7 @@ set -ex set -o pipefail +compress_pids="" # See https://llvm.org/docs/AMDGPUUsage.html#processors for reference amdGPUs() { @@ -274,4 +275,5 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then fi cleanup +wait_for_compress echo "go generate completed. 
LLM runners: $(cd ${BUILD_DIR}/..; echo *)" diff --git a/scripts/build_linux.sh b/scripts/build_linux.sh index 4ea51229..ebb60c5a 100755 --- a/scripts/build_linux.sh +++ b/scripts/build_linux.sh @@ -4,6 +4,7 @@ set -eu export VERSION=${VERSION:-$(git describe --tags --first-parent --abbrev=7 --long --dirty --always | sed -e "s/^v//g")} export GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$VERSION\" \"-X=github.com/ollama/ollama/server.mode=release\"'" +GZIP=$(which pigz 2>/dev/null || echo "gzip") BUILD_ARCH=${BUILD_ARCH:-"amd64 arm64"} export AMDGPU_TARGETS=${AMDGPU_TARGETS:=""} @@ -25,5 +26,5 @@ for TARGETARCH in ${BUILD_ARCH}; do docker rm builder-$TARGETARCH echo "Compressing final linux bundle..." rm -f ./dist/ollama-linux-$TARGETARCH.tgz - (cd dist/linux-$TARGETARCH && tar cf - . | gzip --best > ../ollama-linux-$TARGETARCH.tgz ) + (cd dist/linux-$TARGETARCH && tar cf - . | ${GZIP} --best > ../ollama-linux-$TARGETARCH.tgz ) done diff --git a/scripts/rh_linux_deps.sh b/scripts/rh_linux_deps.sh index 81648d68..b4c9afd6 100644 --- a/scripts/rh_linux_deps.sh +++ b/scripts/rh_linux_deps.sh @@ -3,6 +3,7 @@ # Script for common Dockerfile dependency installation in redhat linux based images set -ex +set -o pipefail MACHINE=$(uname -m) if grep -i "centos" /etc/system-release >/dev/null; then @@ -29,7 +30,7 @@ if grep -i "centos" /etc/system-release >/dev/null; then dnf install -y rh-git227-git ln -s /opt/rh/rh-git227/root/usr/bin/git /usr/local/bin/git fi - dnf install -y devtoolset-10-gcc devtoolset-10-gcc-c++ + dnf install -y devtoolset-10-gcc devtoolset-10-gcc-c++ pigz elif grep -i "rocky" /etc/system-release >/dev/null; then # Temporary workaround until rocky 8 AppStream ships GCC 10.4 (10.3 is incompatible with NVCC) cat << EOF > /etc/yum.repos.d/Rocky-Vault.repo @@ -43,12 +44,21 @@ gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-rockyofficial EOF dnf install -y git \ gcc-toolset-10-gcc-10.2.1-8.2.el8 \ - gcc-toolset-10-gcc-c++-10.2.1-8.2.el8 + gcc-toolset-10-gcc-c++-10.2.1-8.2.el8 \ + pigz else echo "ERROR Unexpected distro" exit 1 fi +if [ "${MACHINE}" = "x86_64" ] ; then + curl -s -L https://github.com/ccache/ccache/releases/download/v4.10.2/ccache-4.10.2-linux-x86_64.tar.xz | tar -Jx -C /tmp --strip-components 1 && \ + mv /tmp/ccache /usr/local/bin/ +else + yum -y install epel-release + yum install -y ccache +fi + if [ -n "${CMAKE_VERSION}" ]; then curl -s -L https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-$(uname -m).tar.gz | tar -zx -C /usr --strip-components 1 fi From d470ebe78bc76c098bc378f98f08f7094063ab4d Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Thu, 30 May 2024 21:54:07 -0700 Subject: [PATCH 55/71] Add Jetson cuda variants for arm This adds new variants for arm64 specific to Jetson platforms --- Dockerfile | 48 +++++++++++++++++++++++++++++++++++---- gpu/gpu.go | 44 +++++++++++++++++++++++++++++++++-- gpu/gpu_darwin.go | 4 ++-- gpu/types.go | 6 ++--- llm/generate/gen_linux.sh | 5 ++-- llm/payload.go | 4 ++-- scripts/build_linux.sh | 1 + 7 files changed, 96 insertions(+), 16 deletions(-) diff --git a/Dockerfile b/Dockerfile index 8eb90057..79b2a696 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,6 +3,9 @@ ARG CMAKE_VERSION=3.22.1 # this CUDA_VERSION corresponds with the one specified in docs/gpu.md ARG CUDA_VERSION=11.3.1 ARG ROCM_VERSION=6.1.2 +ARG JETPACK_6=r36.2.0 +ARG JETPACK_5=r35.4.1 +ARG JETPACK_4=r32.7.1 # Copy the minimal context we need to run the generate scripts FROM scratch AS llm-code 
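The JetPack stages above pair with runtime detection added later in this patch (gpu/gpu.go), which probes /etc/nv_tegra_release to pick a matching variant. A minimal standalone sketch of that probe, with an assumed sample of the file's first line:

package main

import (
	"fmt"
	"regexp"
)

func main() {
	// assumed sample contents of /etc/nv_tegra_release on a JetPack 5 device
	data := "# R35 (release), REVISION: 4.1, GCID: 33958178, BOARD: t186ref"
	if m := regexp.MustCompile(` R(\d+) `).FindStringSubmatch(data); len(m) == 2 {
		fmt.Println("L4T major release:", m[1]) // R35 -> jetpack5, R36 -> jetpack6
	}
}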
@@ -22,7 +25,7 @@ ENV GOARCH amd64 RUN --mount=type=cache,target=/root/.ccache \ OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh -FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION-devel-rockylinux8 AS cuda-build-arm64 +FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION-devel-rockylinux8 AS cuda-build-server-arm64 ARG CMAKE_VERSION COPY ./scripts/rh_linux_deps.sh / RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh @@ -31,11 +34,40 @@ COPY --from=llm-code / /go/src/github.com/ollama/ollama/ WORKDIR /go/src/github.com/ollama/ollama/llm/generate ARG CGO_CFLAGS ENV GOARCH arm64 +RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh + +FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_6} AS cuda-build-jetpack6-arm64 +ARG CMAKE_VERSION +RUN apt-get update && apt-get install -y git curl && \ + curl -s -L https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-$(uname -m).tar.gz | tar -zx -C /usr --strip-components 1 +COPY --from=llm-code / /go/src/github.com/ollama/ollama/ +WORKDIR /go/src/github.com/ollama/ollama/llm/generate +ARG CGO_CFLAGS +ENV GOARCH arm64 +ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs RUN --mount=type=cache,target=/root/.ccache \ OLLAMA_SKIP_STATIC_GENERATE=1 \ OLLAMA_SKIP_CPU_GENERATE=1 \ - CMAKE_CUDA_ARCHITECTURES="${CUDA_V11_ARCHITECTURES}" \ - CUDA_VARIANT="_v11" \ + CUDA_VARIANT="_jetpack6" \ + CUDA_DIST_DIR="/go/src/github.com/ollama/ollama/dist/linux-arm64/ollama_libs/cuda_jetpack6" \ + CMAKE_CUDA_ARCHITECTURES="87" \ + bash gen_linux.sh + +FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_5} AS cuda-build-jetpack5-arm64 +ARG CMAKE_VERSION +RUN apt-get update && apt-get install -y git curl && \ + curl -s -L https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-$(uname -m).tar.gz | tar -zx -C /usr --strip-components 1 +COPY --from=llm-code / /go/src/github.com/ollama/ollama/ +WORKDIR /go/src/github.com/ollama/ollama/llm/generate +ARG CGO_CFLAGS +ENV GOARCH arm64 +ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs +RUN --mount=type=cache,target=/root/.ccache \ + OLLAMA_SKIP_STATIC_GENERATE=1 \ + OLLAMA_SKIP_CPU_GENERATE=1 \ + CUDA_VARIANT="_jetpack5" \ + CUDA_DIST_DIR="/go/src/github.com/ollama/ollama/dist/linux-arm64/ollama_libs/cuda_jetpack5" \ + CMAKE_CUDA_ARCHITECTURES="72;87" \ bash gen_linux.sh FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS rocm-build-amd64 @@ -123,8 +155,14 @@ ARG GOLANG_VERSION WORKDIR /go/src/github.com/ollama/ollama COPY . . 
COPY --from=static-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ -COPY --from=cuda-build-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ -COPY --from=cuda-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ +COPY --from=cuda-build-server-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ +COPY --from=cuda-build-server-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ +## arm binary += 381M +COPY --from=cuda-build-jetpack6-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ +COPY --from=cuda-build-jetpack6-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ +## arm binary += 330M +COPY --from=cuda-build-jetpack5-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ +COPY --from=cuda-build-jetpack5-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ ARG GOFLAGS ARG CGO_CFLAGS RUN --mount=type=cache,target=/root/.ccache \ diff --git a/gpu/gpu.go b/gpu/gpu.go index d0ae0f34..22461922 100644 --- a/gpu/gpu.go +++ b/gpu/gpu.go @@ -15,7 +15,9 @@ import ( "log/slog" "os" "path/filepath" + "regexp" "runtime" + "strconv" "strings" "sync" "unsafe" @@ -215,7 +217,7 @@ func GetGPUInfo() GpuInfoList { GpuInfo: GpuInfo{ memInfo: mem, Library: "cpu", - Variant: cpuCapability, + Variant: cpuCapability.String(), ID: "0", }, }, @@ -231,6 +233,35 @@ func GetGPUInfo() GpuInfoList { depPath := GetDepDir() + var cudaVariant string + if runtime.GOARCH == "arm64" && runtime.GOOS == "linux" { + if CudaTegra != "" { + ver := strings.Split(CudaTegra, ".") + if len(ver) > 0 { + cudaVariant = "jetpack" + ver[0] + } + } else if data, err := os.ReadFile("/etc/nv_tegra_release"); err == nil { + r := regexp.MustCompile(` R(\d+) `) + m := r.FindSubmatch(data) + if len(m) != 2 { + slog.Info("Unexpected format for /etc/nv_tegra_release. 
Set JETSON_JETPACK to select version") + } else { + if l4t, err := strconv.Atoi(string(m[1])); err == nil { + // Note: mapping from L4t -> JP is inconsistent (can't just subtract 30) + // https://developer.nvidia.com/embedded/jetpack-archive + switch l4t { + case 35: + cudaVariant = "jetpack5" + case 36: + cudaVariant = "jetpack6" + default: + slog.Info("unsupported L4T version", "nv_tegra_release", string(data)) + } + } + } + } + } + // Load ALL libraries cHandles = initCudaHandles() @@ -240,6 +271,7 @@ func GetGPUInfo() GpuInfoList { gpuInfo := CudaGPUInfo{ GpuInfo: GpuInfo{ Library: "cuda", + Variant: cudaVariant, }, index: i, } @@ -266,7 +298,15 @@ func GetGPUInfo() GpuInfoList { gpuInfo.ID = C.GoString(&memInfo.gpu_id[0]) gpuInfo.Compute = fmt.Sprintf("%d.%d", memInfo.major, memInfo.minor) gpuInfo.MinimumMemory = cudaMinimumMemory - gpuInfo.DependencyPath = depPath + if depPath != "" { + gpuInfo.DependencyPath = depPath + // Check for variant specific directory + if cudaVariant != "" { + if _, err := os.Stat(filepath.Join(depPath, "cuda_"+cudaVariant)); err == nil { + gpuInfo.DependencyPath = filepath.Join(depPath, "cuda_"+cudaVariant) + } + } + } gpuInfo.Name = C.GoString(&memInfo.gpu_name[0]) gpuInfo.DriverMajor = driverMajor gpuInfo.DriverMinor = driverMinor diff --git a/gpu/gpu_darwin.go b/gpu/gpu_darwin.go index 9d9fd84e..417b48df 100644 --- a/gpu/gpu_darwin.go +++ b/gpu/gpu_darwin.go @@ -25,7 +25,7 @@ func GetGPUInfo() GpuInfoList { return []GpuInfo{ { Library: "cpu", - Variant: GetCPUCapability(), + Variant: GetCPUCapability().String(), memInfo: mem, }, } @@ -48,7 +48,7 @@ func GetCPUInfo() GpuInfoList { return []GpuInfo{ { Library: "cpu", - Variant: GetCPUCapability(), + Variant: GetCPUCapability().String(), memInfo: mem, }, } diff --git a/gpu/types.go b/gpu/types.go index 8d22b06b..fc628d47 100644 --- a/gpu/types.go +++ b/gpu/types.go @@ -19,7 +19,7 @@ type GpuInfo struct { Library string `json:"library,omitempty"` // Optional variant to select (e.g. versions, cpu feature flags) - Variant CPUCapability `json:"variant"` + Variant string `json:"variant"` // MinimumMemory represents the minimum memory required to use the GPU MinimumMemory uint64 `json:"-"` @@ -81,8 +81,8 @@ func (l GpuInfoList) ByLibrary() []GpuInfoList { for _, info := range l { found := false requested := info.Library - if info.Variant != CPUCapabilityNone { - requested += "_" + info.Variant.String() + if info.Variant != CPUCapabilityNone.String() { + requested += "_" + info.Variant } for i, lib := range libs { if lib == requested { diff --git a/llm/generate/gen_linux.sh b/llm/generate/gen_linux.sh index 1365d07d..dc9dda5a 100755 --- a/llm/generate/gen_linux.sh +++ b/llm/generate/gen_linux.sh @@ -165,7 +165,7 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then echo "CUDA libraries detected - building dynamic CUDA library" init_vars CUDA_MAJOR=$(ls "${CUDA_LIB_DIR}"/libcudart.so.* | head -1 | cut -f3 -d. 
|| true) - if [ -n "${CUDA_MAJOR}" ]; then + if [ -n "${CUDA_MAJOR}" -a -z "${CUDA_VARIANT}" ]; then CUDA_VARIANT=_v${CUDA_MAJOR} fi if [ "${ARCH}" == "arm64" ]; then @@ -189,9 +189,10 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS} -DGGML_STATIC=off" BUILD_DIR="../build/linux/${ARCH}/cuda${CUDA_VARIANT}" export LLAMA_SERVER_LDFLAGS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda" - CUDA_DIST_DIR="${DIST_BASE}/ollama_libs" + CUDA_DIST_DIR="${CUDA_DIST_DIR:-${DIST_BASE}/ollama_libs}" build install + echo "Installing CUDA dependencies in ${CUDA_DIST_DIR}" mkdir -p "${CUDA_DIST_DIR}" for lib in ${CUDA_LIB_DIR}/libcudart.so* ${CUDA_LIB_DIR}/libcublas.so* ${CUDA_LIB_DIR}/libcublasLt.so* ; do cp -a "${lib}" "${CUDA_DIST_DIR}" diff --git a/llm/payload.go b/llm/payload.go index b402e1f2..963b3295 100644 --- a/llm/payload.go +++ b/llm/payload.go @@ -82,8 +82,8 @@ func serversForGpu(info gpu.GpuInfo) []string { // glob workDir for files that start with ollama_ availableServers := getAvailableServers() requested := info.Library - if info.Variant != gpu.CPUCapabilityNone { - requested += "_" + info.Variant.String() + if info.Variant != gpu.CPUCapabilityNone.String() { + requested += "_" + info.Variant } servers := []string{} diff --git a/scripts/build_linux.sh b/scripts/build_linux.sh index ebb60c5a..adda2ad7 100755 --- a/scripts/build_linux.sh +++ b/scripts/build_linux.sh @@ -22,6 +22,7 @@ for TARGETARCH in ${BUILD_ARCH}; do -t builder:$TARGETARCH \ . docker create --platform linux/$TARGETARCH --name builder-$TARGETARCH builder:$TARGETARCH + rm -rf ./dist/linux-$TARGETARCH docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/dist/linux-$TARGETARCH ./dist docker rm builder-$TARGETARCH echo "Compressing final linux bundle..." From fc3b4cda89f468f923e2e6095c6a62a5c3c336ff Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Wed, 19 Jun 2024 09:36:30 -0700 Subject: [PATCH 56/71] Report GPU variant in log --- gpu/types.go | 1 + 1 file changed, 1 insertion(+) diff --git a/gpu/types.go b/gpu/types.go index fc628d47..88539078 100644 --- a/gpu/types.go +++ b/gpu/types.go @@ -105,6 +105,7 @@ func (l GpuInfoList) LogDetails() { slog.Info("inference compute", "id", g.ID, "library", g.Library, + "variant", g.Variant, "compute", g.Compute, "driver", fmt.Sprintf("%d.%d", g.DriverMajor, g.DriverMinor), "name", g.Name, From 4fe3a556faf790ba993223cfdda16e281b6cb76d Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Thu, 13 Jun 2024 20:46:14 -0700 Subject: [PATCH 57/71] Add cuda v12 variant and selection logic Based on compute capability and driver version, pick v12 or v11 cuda variants. 
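In sketch form, the selection rule introduced here (mirroring the check added in gpu/cuda_common.go below), assuming the compute capability and driver major version are already known:

func pickCudaVariant(computeMajor, driverMajor int) string {
	// GPUs below compute capability 6.x, or drivers older than CUDA 12,
	// fall back to the v11 build
	if computeMajor < 6 || driverMajor < 12 {
		return "v11"
	}
	return "v12"
}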
--- Dockerfile | 43 +++++++++++++++++++++++++++++++++---------- gpu/cuda_common.go | 43 +++++++++++++++++++++++++++++++++++++++++++ gpu/gpu.go | 40 ++++------------------------------------ gpu/types.go | 6 ++++-- 4 files changed, 84 insertions(+), 48 deletions(-) diff --git a/Dockerfile b/Dockerfile index 79b2a696..e200f5d4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,7 +1,7 @@ ARG GOLANG_VERSION=1.22.5 ARG CMAKE_VERSION=3.22.1 -# this CUDA_VERSION corresponds with the one specified in docs/gpu.md -ARG CUDA_VERSION=11.3.1 +ARG CUDA_VERSION_11=11.3.1 +ARG CUDA_VERSION_12=12.4.0 ARG ROCM_VERSION=6.1.2 ARG JETPACK_6=r36.2.0 ARG JETPACK_5=r35.4.1 @@ -13,7 +13,7 @@ COPY .git .git COPY .gitmodules .gitmodules COPY llm llm -FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION-devel-centos7 AS cuda-build-amd64 +FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION_11-devel-centos7 AS cuda-11-build-amd64 ARG CMAKE_VERSION COPY ./scripts/rh_linux_deps.sh / RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh @@ -23,9 +23,29 @@ WORKDIR /go/src/github.com/ollama/ollama/llm/generate ARG CGO_CFLAGS ENV GOARCH amd64 RUN --mount=type=cache,target=/root/.ccache \ - OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh + OLLAMA_SKIP_STATIC_GENERATE=1 \ + OLLAMA_SKIP_CPU_GENERATE=1 \ + CMAKE_CUDA_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86" \ + CUDA_VARIANT="_v11" \ + bash gen_linux.sh -FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION-devel-rockylinux8 AS cuda-build-server-arm64 +FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION_12-devel-centos7 AS cuda-12-build-amd64 +ARG CMAKE_VERSION +COPY ./scripts/rh_linux_deps.sh / +RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh +ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH +COPY --from=llm-code / /go/src/github.com/ollama/ollama/ +WORKDIR /go/src/github.com/ollama/ollama/llm/generate +ARG CGO_CFLAGS +ENV GOARCH amd64 +RUN --mount=type=cache,target=/root/.ccache \ + OLLAMA_SKIP_STATIC_GENERATE=1 \ + OLLAMA_SKIP_CPU_GENERATE=1 \ + CMAKE_CUDA_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a" \ + CUDA_VARIANT="_v12" \ + bash gen_linux.sh + +FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_11-devel-rockylinux8 AS cuda-11-build-server-arm64 ARG CMAKE_VERSION COPY ./scripts/rh_linux_deps.sh / RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh @@ -34,7 +54,8 @@ COPY --from=llm-code / /go/src/github.com/ollama/ollama/ WORKDIR /go/src/github.com/ollama/ollama/llm/generate ARG CGO_CFLAGS ENV GOARCH arm64 -RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh +RUN --mount=type=cache,target=/root/.ccache \ + OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_6} AS cuda-build-jetpack6-arm64 ARG CMAKE_VERSION @@ -139,8 +160,10 @@ COPY . . 
COPY --from=static-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ -COPY --from=cuda-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/ -COPY --from=cuda-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ +COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/ +COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ +COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/ +COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/ COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ ARG GOFLAGS @@ -155,8 +178,8 @@ ARG GOLANG_VERSION WORKDIR /go/src/github.com/ollama/ollama COPY . . COPY --from=static-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ -COPY --from=cuda-build-server-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ -COPY --from=cuda-build-server-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ +COPY --from=cuda-11-build-server-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ +COPY --from=cuda-11-build-server-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ ## arm binary += 381M COPY --from=cuda-build-jetpack6-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ COPY --from=cuda-build-jetpack6-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ diff --git a/gpu/cuda_common.go b/gpu/cuda_common.go index c90a644c..defaa60a 100644 --- a/gpu/cuda_common.go +++ b/gpu/cuda_common.go @@ -4,9 +4,17 @@ package gpu import ( "log/slog" + "os" + "regexp" + "runtime" + "strconv" "strings" ) +// Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed. +// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices. +var CudaTegra string = os.Getenv("JETSON_JETPACK") + func cudaGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) { ids := []string{} for _, info := range gpuInfo { @@ -19,3 +27,38 @@ func cudaGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) { } return "CUDA_VISIBLE_DEVICES", strings.Join(ids, ",") } + +func cudaGetVariant(gpuInfo CudaGPUInfo) string { + if runtime.GOARCH == "arm64" && runtime.GOOS == "linux" { + if CudaTegra != "" { + ver := strings.Split(CudaTegra, ".") + if len(ver) > 0 { + return "jetpack" + ver[0] + } + } else if data, err := os.ReadFile("/etc/nv_tegra_release"); err == nil { + r := regexp.MustCompile(` R(\d+) `) + m := r.FindSubmatch(data) + if len(m) != 2 { + slog.Info("Unexpected format for /etc/nv_tegra_release. 
Set JETSON_JETPACK to select version") + } else { + if l4t, err := strconv.Atoi(string(m[1])); err == nil { + // Note: mapping from L4t -> JP is inconsistent (can't just subtract 30) + // https://developer.nvidia.com/embedded/jetpack-archive + switch l4t { + case 35: + return "jetpack5" + case 36: + return "jetpack6" + default: + slog.Info("unsupported L4T version", "nv_tegra_release", string(data)) + } + } + } + } + } + + if gpuInfo.computeMajor < 6 || gpuInfo.DriverMajor < 12 { + return "v11" + } + return "v12" +} diff --git a/gpu/gpu.go b/gpu/gpu.go index 22461922..eb87807a 100644 --- a/gpu/gpu.go +++ b/gpu/gpu.go @@ -15,9 +15,7 @@ import ( "log/slog" "os" "path/filepath" - "regexp" "runtime" - "strconv" "strings" "sync" "unsafe" @@ -66,10 +64,6 @@ var RocmComputeMin = 9 // TODO find a better way to detect iGPU instead of minimum memory const IGPUMemLimit = 1 * format.GibiByte // 512G is what they typically report, so anything less than 1G must be iGPU -// Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed. -// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices. -var CudaTegra string = os.Getenv("JETSON_JETPACK") - // Note: gpuMutex must already be held func initCudaHandles() *cudaHandles { // TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing @@ -233,35 +227,6 @@ func GetGPUInfo() GpuInfoList { depPath := GetDepDir() - var cudaVariant string - if runtime.GOARCH == "arm64" && runtime.GOOS == "linux" { - if CudaTegra != "" { - ver := strings.Split(CudaTegra, ".") - if len(ver) > 0 { - cudaVariant = "jetpack" + ver[0] - } - } else if data, err := os.ReadFile("/etc/nv_tegra_release"); err == nil { - r := regexp.MustCompile(` R(\d+) `) - m := r.FindSubmatch(data) - if len(m) != 2 { - slog.Info("Unexpected format for /etc/nv_tegra_release. 
Set JETSON_JETPACK to select version") - } else { - if l4t, err := strconv.Atoi(string(m[1])); err == nil { - // Note: mapping from L4t -> JP is inconsistent (can't just subtract 30) - // https://developer.nvidia.com/embedded/jetpack-archive - switch l4t { - case 35: - cudaVariant = "jetpack5" - case 36: - cudaVariant = "jetpack6" - default: - slog.Info("unsupported L4T version", "nv_tegra_release", string(data)) - } - } - } - } - } - // Load ALL libraries cHandles = initCudaHandles() @@ -271,7 +236,6 @@ func GetGPUInfo() GpuInfoList { gpuInfo := CudaGPUInfo{ GpuInfo: GpuInfo{ Library: "cuda", - Variant: cudaVariant, }, index: i, } @@ -297,7 +261,10 @@ func GetGPUInfo() GpuInfoList { gpuInfo.FreeMemory = uint64(memInfo.free) gpuInfo.ID = C.GoString(&memInfo.gpu_id[0]) gpuInfo.Compute = fmt.Sprintf("%d.%d", memInfo.major, memInfo.minor) + gpuInfo.computeMajor = int(memInfo.major) + gpuInfo.computeMinor = int(memInfo.minor) gpuInfo.MinimumMemory = cudaMinimumMemory + cudaVariant := cudaGetVariant(gpuInfo) if depPath != "" { gpuInfo.DependencyPath = depPath // Check for variant specific directory @@ -310,6 +277,7 @@ func GetGPUInfo() GpuInfoList { gpuInfo.Name = C.GoString(&memInfo.gpu_name[0]) gpuInfo.DriverMajor = driverMajor gpuInfo.DriverMinor = driverMinor + gpuInfo.Variant = cudaGetVariant(gpuInfo) // query the management library as well so we can record any skew between the two // which represents overhead on the GPU we must set aside on subsequent updates diff --git a/gpu/types.go b/gpu/types.go index 88539078..4cbbeb84 100644 --- a/gpu/types.go +++ b/gpu/types.go @@ -53,8 +53,10 @@ type CPUInfo struct { type CudaGPUInfo struct { GpuInfo - OSOverhead uint64 // Memory overhead between the driver library and management library - index int //nolint:unused,nolintlint + OSOverhead uint64 // Memory overhead between the driver library and management library + index int //nolint:unused,nolintlint + computeMajor int //nolint:unused,nolintlint + computeMinor int //nolint:unused,nolintlint } type CudaGPUInfoList []CudaGPUInfo From f6c811b32075cb3b7633d7d4213251d474a77682 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Fri, 12 Jul 2024 11:35:41 -0700 Subject: [PATCH 58/71] Enable cuda v12 flags --- Dockerfile | 35 ++++++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index e200f5d4..e83a266a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,7 +1,9 @@ ARG GOLANG_VERSION=1.22.5 ARG CMAKE_VERSION=3.22.1 ARG CUDA_VERSION_11=11.3.1 +ARG CUDA_V11_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86" ARG CUDA_VERSION_12=12.4.0 +ARG CUDA_V12_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a" ARG ROCM_VERSION=6.1.2 ARG JETPACK_6=r36.2.0 ARG JETPACK_5=r35.4.1 @@ -21,11 +23,12 @@ ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH COPY --from=llm-code / /go/src/github.com/ollama/ollama/ WORKDIR /go/src/github.com/ollama/ollama/llm/generate ARG CGO_CFLAGS +ARG CUDA_V11_ARCHITECTURES ENV GOARCH amd64 RUN --mount=type=cache,target=/root/.ccache \ OLLAMA_SKIP_STATIC_GENERATE=1 \ OLLAMA_SKIP_CPU_GENERATE=1 \ - CMAKE_CUDA_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86" \ + CMAKE_CUDA_ARCHITECTURES="${CUDA_V11_ARCHITECTURES}" \ CUDA_VARIANT="_v11" \ bash gen_linux.sh @@ -37,12 +40,14 @@ ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH COPY --from=llm-code / /go/src/github.com/ollama/ollama/ WORKDIR /go/src/github.com/ollama/ollama/llm/generate ARG CGO_CFLAGS +ARG CUDA_V12_ARCHITECTURES ENV GOARCH amd64 RUN --mount=type=cache,target=/root/.ccache \ 
OLLAMA_SKIP_STATIC_GENERATE=1 \ OLLAMA_SKIP_CPU_GENERATE=1 \ - CMAKE_CUDA_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a" \ + CMAKE_CUDA_ARCHITECTURES="${CUDA_V12_ARCHITECTURES}" \ CUDA_VARIANT="_v12" \ + OLLAMA_CUSTOM_CUDA_DEFS="-DGGML_CUDA_USE_GRAPHS=on" \ bash gen_linux.sh FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_11-devel-rockylinux8 AS cuda-11-build-server-arm64 @@ -53,9 +58,31 @@ ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH COPY --from=llm-code / /go/src/github.com/ollama/ollama/ WORKDIR /go/src/github.com/ollama/ollama/llm/generate ARG CGO_CFLAGS +ARG CUDA_V11_ARCHITECTURES +ENV GOARCH arm64 +RUN OLLAMA_SKIP_STATIC_GENERATE=1 \ + OLLAMA_SKIP_CPU_GENERATE=1 \ + CMAKE_CUDA_ARCHITECTURES="${CUDA_V11_ARCHITECTURES}" \ + CUDA_VARIANT="_v11" \ + bash gen_linux.sh + +FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_12-devel-rockylinux8 AS cuda-12-build-server-arm64 +ARG CMAKE_VERSION +COPY ./scripts/rh_linux_deps.sh / +RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh +ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH +COPY --from=llm-code / /go/src/github.com/ollama/ollama/ +WORKDIR /go/src/github.com/ollama/ollama/llm/generate +ARG CGO_CFLAGS +ARG CUDA_V12_ARCHITECTURES ENV GOARCH arm64 RUN --mount=type=cache,target=/root/.ccache \ - OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh + OLLAMA_SKIP_STATIC_GENERATE=1 \ + OLLAMA_SKIP_CPU_GENERATE=1 \ + CMAKE_CUDA_ARCHITECTURES="${CUDA_V12_ARCHITECTURES}" \ + CUDA_VARIANT="_v12" \ + OLLAMA_CUSTOM_CUDA_DEFS="-DGGML_CUDA_USE_GRAPHS=on" \ + bash gen_linux.sh FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_6} AS cuda-build-jetpack6-arm64 ARG CMAKE_VERSION @@ -180,6 +207,8 @@ COPY . . COPY --from=static-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ COPY --from=cuda-11-build-server-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ COPY --from=cuda-11-build-server-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ +COPY --from=cuda-12-build-server-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ +COPY --from=cuda-12-build-server-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ ## arm binary += 381M COPY --from=cuda-build-jetpack6-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ COPY --from=cuda-build-jetpack6-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ From 927d98a6cde43ffee3ef269cf013df5e96cbe767 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Fri, 12 Jul 2024 14:33:13 -0700 Subject: [PATCH 59/71] Add windows cuda v12 + v11 support --- .github/workflows/release.yaml | 93 ++++++++++++++++++++++++++++++++-- llm/generate/gen_windows.ps1 | 6 +-- scripts/build_windows.ps1 | 63 ++++++++++++++++++----- 3 files changed, 142 insertions(+), 20 deletions(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 9287f6f7..4bd68455 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -183,8 +183,8 @@ jobs: name: windows-rocm-deps path: dist/deps/* - # CUDA generation step - generate-windows-cuda: + # CUDA v11 generation step + generate-windows-cuda-v11: environment: release runs-on: windows env: @@ -256,7 +256,89 @@ jobs: cp "${NVIDIA_DIR}\cublasLt64_*.dll" "dist\deps\" - uses: actions/upload-artifact@v4 with: - name: generate-windows-cuda + name: generate-windows-cuda-v11 + path: | + llm/build/**/bin/* + dist/windows-amd64/** + - uses: actions/upload-artifact@v4 + with: + name: windows-cuda-deps + path: 
dist/deps/* + + # CUDA v12 generation step + generate-windows-cuda-v12: + environment: release + runs-on: windows + env: + KEY_CONTAINER: ${{ vars.KEY_CONTAINER }} + steps: + - uses: actions/checkout@v4 + - name: Set Version + shell: bash + run: echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV + - uses: 'google-github-actions/auth@v2' + with: + project_id: 'ollama' + credentials_json: '${{ secrets.GOOGLE_SIGNING_CREDENTIALS }}' + - run: echo "${{ vars.OLLAMA_CERT }}" > ollama_inc.crt + - name: install Windows SDK 8.1 to get signtool + run: | + $ErrorActionPreference = "Stop" + write-host "downloading SDK" + Invoke-WebRequest -Uri "https://go.microsoft.com/fwlink/p/?LinkId=323507" -OutFile "${env:RUNNER_TEMP}\sdksetup.exe" + Start-Process "${env:RUNNER_TEMP}\sdksetup.exe" -ArgumentList @("/q") -NoNewWindow -Wait + write-host "Win SDK 8.1 installed" + gci -path 'C:\Program Files (x86)\Windows Kits\' -r -fi 'signtool.exe' + - name: install signing plugin + run: | + $ErrorActionPreference = "Stop" + write-host "downloading plugin" + Invoke-WebRequest -Uri "https://github.com/GoogleCloudPlatform/kms-integrations/releases/download/cng-v1.0/kmscng-1.0-windows-amd64.zip" -OutFile "${env:RUNNER_TEMP}\plugin.zip" + Expand-Archive -Path "${env:RUNNER_TEMP}\plugin.zip" -DestinationPath ${env:RUNNER_TEMP}\plugin\ + write-host "Installing plugin" + & "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet + write-host "plugin installed" + - uses: actions/setup-go@v5 + with: + go-version-file: go.mod + cache: true + - name: 'Install CUDA' + run: | + $ErrorActionPreference = "Stop" + write-host "downloading CUDA Installer" + Invoke-WebRequest -Uri "https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_551.61_windows.exe" -OutFile "${env:RUNNER_TEMP}\cuda-install.exe" + write-host "Installing CUDA" + Start-Process "${env:RUNNER_TEMP}\cuda-install.exe" -ArgumentList '-s' -NoNewWindow -Wait + write-host "Completed CUDA" + $cudaPath=((resolve-path "c:\Program Files\NVIDIA*\CUDA\v*\bin\nvcc.exe")[0].path | split-path | split-path) + $cudaVer=($cudaPath | split-path -leaf ) -replace 'v(\d+).(\d+)', '$1_$2' + echo "$cudaPath\bin" >> $env:GITHUB_PATH + echo "CUDA_PATH=$cudaPath" >> $env:GITHUB_ENV + echo "CUDA_PATH_V${cudaVer}=$cudaPath" >> $env:GITHUB_ENV + echo "CUDA_PATH_VX_Y=CUDA_PATH_V${cudaVer}" >> $env:GITHUB_ENV + - name: 'Verify CUDA' + run: nvcc -V + - run: go get ./... + - name: go generate + run: | + $gopath=(get-command go).source | split-path -parent + $cudabin=(get-command nvcc).source | split-path + & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1" + cd $env:GITHUB_WORKSPACE + $env:CMAKE_SYSTEM_VERSION="10.0.22621.0" + $env:PATH="$gopath;$cudabin;$env:PATH" + $env:OLLAMA_SKIP_CPU_GENERATE="1" + go generate -x ./... 
+ - name: 'gather cuda dependencies' + run: | + $NVIDIA_DIR=(resolve-path 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\*\bin\')[0] + md "dist\deps" + cp "${NVIDIA_DIR}\cudart64_*.dll" "dist\deps\" + cp "${NVIDIA_DIR}\cublas64_*.dll" "dist\deps\" + cp "${NVIDIA_DIR}\cublasLt64_*.dll" "dist\deps\" + - uses: actions/upload-artifact@v4 + with: + name: generate-windows-cuda-v12 path: | llm/build/**/bin/* dist/windows-amd64/** @@ -270,7 +352,8 @@ jobs: environment: release runs-on: windows needs: - - generate-windows-cuda + - generate-windows-cuda-v11 + - generate-windows-cuda-v12 - generate-windows-rocm - generate-windows-cpu env: @@ -314,7 +397,7 @@ jobs: name: generate-windows-cpu - uses: actions/download-artifact@v4 with: - name: generate-windows-cuda + name: generate-windows-cuda-v11 - uses: actions/download-artifact@v4 with: name: windows-cuda-deps diff --git a/llm/generate/gen_windows.ps1 b/llm/generate/gen_windows.ps1 index 1f8c96d8..42708d3e 100644 --- a/llm/generate/gen_windows.ps1 +++ b/llm/generate/gen_windows.ps1 @@ -261,7 +261,7 @@ function build_cuda() { if ((-not "${env:OLLAMA_SKIP_CUDA_GENERATE}") -and ("${script:CUDA_LIB_DIR}")) { # Then build cuda as a dynamically loaded library $nvcc = "$script:CUDA_LIB_DIR\nvcc.exe" - $script:CUDA_VERSION=(get-item ($nvcc | split-path | split-path)).Basename + $script:CUDA_VERSION=((get-item ($nvcc | split-path | split-path)).Basename -Split "\.")[0] if ($null -ne $script:CUDA_VERSION) { $script:CUDA_VARIANT="_"+$script:CUDA_VERSION } @@ -273,9 +273,9 @@ function build_cuda() { "-DGGML_CUDA=ON", "-DGGML_AVX=on", "-DGGML_AVX2=off", - "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR", "-DCMAKE_CUDA_FLAGS=-t8", - "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}" + "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}", + "-DCMAKE_CUDA_COMPILER_TOOLKIT_ROOT=$env:CUDA_PATH" ) if ($null -ne $env:OLLAMA_CUSTOM_CUDA_DEFS) { write-host "OLLAMA_CUSTOM_CUDA_DEFS=`"${env:OLLAMA_CUSTOM_CUDA_DEFS}`"" diff --git a/scripts/build_windows.ps1 b/scripts/build_windows.ps1 index e8d851f4..50b60230 100644 --- a/scripts/build_windows.ps1 +++ b/scripts/build_windows.ps1 @@ -7,6 +7,7 @@ $ErrorActionPreference = "Stop" function checkEnv() { + $script:ARCH = $Env:PROCESSOR_ARCHITECTURE.ToLower() $script:TARGET_ARCH=$Env:PROCESSOR_ARCHITECTURE.ToLower() Write-host "Building for ${script:TARGET_ARCH}" write-host "Locating required tools and paths" @@ -15,26 +16,23 @@ function checkEnv() { $MSVC_INSTALL=(Get-CimInstance MSFT_VSInstance -Namespace root/cimv2/vs)[0].InstallLocation $env:VCToolsRedistDir=(get-item "${MSVC_INSTALL}\VC\Redist\MSVC\*")[0] } - # Try to find the CUDA dir - if ($null -eq $env:NVIDIA_DIR) { + # Locate CUDA versions + # Note: this assumes every version found will be built + $cudaList=(get-item "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v*\bin\" -ea 'silentlycontinue') + if ($cudaList.length -eq 0) { $d=(get-command -ea 'silentlycontinue' nvcc).path - if ($d -ne $null) { - $script:NVIDIA_DIR=($d| split-path -parent) - } else { - $cudaList=(get-item "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v*\bin\" -ea 'silentlycontinue') - if ($cudaList.length > 0) { - $script:NVIDIA_DIR=$cudaList[0] - } + if ($null -ne $d) { + $script:CUDA_DIRS=@($d| split-path -parent) } } else { - $script:NVIDIA_DIR=$env:NVIDIA_DIR + $script:CUDA_DIRS=$cudaList } $script:INNO_SETUP_DIR=(get-item "C:\Program Files*\Inno Setup*\")[0] $script:DEPS_DIR="${script:SRC_DIR}\dist\windows-${script:TARGET_ARCH}" $env:CGO_ENABLED="1" - 
echo "Checking version" + Write-Output "Checking version" if (!$env:VERSION) { $data=(git describe --tags --first-parent --abbrev=7 --long --dirty --always) $pattern="v(.+)" @@ -71,7 +69,48 @@ function checkEnv() { function buildOllama() { write-host "Building ollama CLI" if ($null -eq ${env:OLLAMA_SKIP_GENERATE}) { - & go generate ./... + Remove-Item -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}" + + # TODO - consider trying to parallelize this with Start-ThreadJob, but env vars can't be used to toggle + # which targets to build + + # Start by skipping CUDA to build everything else + pwsh -Command { $env:OLLAMA_SKIP_CUDA_GENERATE="1"; & go generate ./... } + if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} + + # Then skip everyhting else and build all the CUDA variants + foreach ($env:CUDA_LIB_DIR in $script:CUDA_DIRS) { + write-host "Building CUDA ${env:CUDA_LIB_DIR}" + + if ($env:CUDA_LIB_DIR.Contains("v12")) { + pwsh -Command { + $env:OLLAMA_SKIP_CUDA_GENERATE="" + $env:OLLAMA_SKIP_STATIC_GENERATE="1" + $env:OLLAMA_SKIP_CPU_GENERATE="1" + $env:OLLAMA_SKIP_ONEAPI_GENERATE="1" + $env:OLLAMA_SKIP_ROCM_GENERATE="1" + $env:CMAKE_CUDA_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a" + $env:OLLAMA_CUSTOM_CUDA_DEFS="-DGGML_CUDA_USE_GRAPHS=on" + $env:CUDA_PATH=split-path -path $env:CUDA_LIB_DIR -parent + $env:PATH="$envs:CUDA_LIB_DIR;$env:PATH" + & go generate ./... + } + } else { + pwsh -Command { + $env:OLLAMA_SKIP_CUDA_GENERATE="" + $env:OLLAMA_SKIP_STATIC_GENERATE="1" + $env:OLLAMA_SKIP_CPU_GENERATE="1" + $env:OLLAMA_SKIP_ONEAPI_GENERATE="1" + $env:OLLAMA_SKIP_ROCM_GENERATE="1" + $env:CMAKE_CUDA_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86" + $env:OLLAMA_CUSTOM_CUDA_DEFS="" + $env:CUDA_PATH=split-path -path $env:CUDA_LIB_DIR -parent + $env:PATH="$envs:CUDA_LIB_DIR;$env:PATH" + & go generate ./... 
+ } + } + if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} + } if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} } else { write-host "Skipping generate step with OLLAMA_SKIP_GENERATE set" From 3b19cdba2a090772b2e886dbfbf712992fafe0cd Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Tue, 13 Aug 2024 13:30:28 -0700 Subject: [PATCH 60/71] Remove Jetpack --- Dockerfile | 42 ------------------------------------------ 1 file changed, 42 deletions(-) diff --git a/Dockerfile b/Dockerfile index e83a266a..99ba5b65 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,9 +5,6 @@ ARG CUDA_V11_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86" ARG CUDA_VERSION_12=12.4.0 ARG CUDA_V12_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a" ARG ROCM_VERSION=6.1.2 -ARG JETPACK_6=r36.2.0 -ARG JETPACK_5=r35.4.1 -ARG JETPACK_4=r32.7.1 # Copy the minimal context we need to run the generate scripts FROM scratch AS llm-code @@ -84,39 +81,6 @@ RUN --mount=type=cache,target=/root/.ccache \ OLLAMA_CUSTOM_CUDA_DEFS="-DGGML_CUDA_USE_GRAPHS=on" \ bash gen_linux.sh -FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_6} AS cuda-build-jetpack6-arm64 -ARG CMAKE_VERSION -RUN apt-get update && apt-get install -y git curl && \ - curl -s -L https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-$(uname -m).tar.gz | tar -zx -C /usr --strip-components 1 -COPY --from=llm-code / /go/src/github.com/ollama/ollama/ -WORKDIR /go/src/github.com/ollama/ollama/llm/generate -ARG CGO_CFLAGS -ENV GOARCH arm64 -ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs -RUN --mount=type=cache,target=/root/.ccache \ - OLLAMA_SKIP_STATIC_GENERATE=1 \ - OLLAMA_SKIP_CPU_GENERATE=1 \ - CUDA_VARIANT="_jetpack6" \ - CUDA_DIST_DIR="/go/src/github.com/ollama/ollama/dist/linux-arm64/ollama_libs/cuda_jetpack6" \ - CMAKE_CUDA_ARCHITECTURES="87" \ - bash gen_linux.sh - -FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_5} AS cuda-build-jetpack5-arm64 -ARG CMAKE_VERSION -RUN apt-get update && apt-get install -y git curl && \ - curl -s -L https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-$(uname -m).tar.gz | tar -zx -C /usr --strip-components 1 -COPY --from=llm-code / /go/src/github.com/ollama/ollama/ -WORKDIR /go/src/github.com/ollama/ollama/llm/generate -ARG CGO_CFLAGS -ENV GOARCH arm64 -ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs -RUN --mount=type=cache,target=/root/.ccache \ - OLLAMA_SKIP_STATIC_GENERATE=1 \ - OLLAMA_SKIP_CPU_GENERATE=1 \ - CUDA_VARIANT="_jetpack5" \ - CUDA_DIST_DIR="/go/src/github.com/ollama/ollama/dist/linux-arm64/ollama_libs/cuda_jetpack5" \ - CMAKE_CUDA_ARCHITECTURES="72;87" \ - bash gen_linux.sh FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS rocm-build-amd64 ARG CMAKE_VERSION @@ -209,12 +173,6 @@ COPY --from=cuda-11-build-server-arm64 /go/src/github.com/ollama/ollama/dist/ di COPY --from=cuda-11-build-server-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ COPY --from=cuda-12-build-server-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ COPY --from=cuda-12-build-server-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ -## arm binary += 381M -COPY --from=cuda-build-jetpack6-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ -COPY --from=cuda-build-jetpack6-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ -## arm binary += 330M -COPY --from=cuda-build-jetpack5-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ -COPY 
--from=cuda-build-jetpack5-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ ARG GOFLAGS ARG CGO_CFLAGS RUN --mount=type=cache,target=/root/.ccache \ From 88bb9e332877dfbba40030c19570fdbe00f41a21 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Wed, 14 Aug 2024 16:32:57 -0700 Subject: [PATCH 61/71] Adjust layout to bin+lib/ollama --- Dockerfile | 23 ++++++++++++++------ app/ollama.iss | 12 +++++------ docs/linux.md | 10 ++++----- envconfig/config.go | 6 +++--- gpu/amd_common.go | 2 +- gpu/amd_windows.go | 2 +- gpu/gpu.go | 4 ++-- llm/generate/gen_linux.sh | 6 +++--- llm/generate/gen_windows.ps1 | 42 ++++++++++++++++++------------------ scripts/build_windows.ps1 | 16 +++++++------- scripts/install.sh | 14 +++++++----- 11 files changed, 74 insertions(+), 63 deletions(-) diff --git a/Dockerfile b/Dockerfile index 99ba5b65..d4b86918 100644 --- a/Dockerfile +++ b/Dockerfile @@ -95,8 +95,8 @@ ARG AMDGPU_TARGETS ENV GOARCH amd64 RUN --mount=type=cache,target=/root/.ccache \ OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh -RUN mkdir -p ../../dist/linux-amd64/ollama_libs && \ - (cd /opt/rocm/lib && tar cf - rocblas/library) | (cd ../../dist/linux-amd64/ollama_libs && tar xf - ) +RUN mkdir -p ../../dist/linux-amd64/lib/ollama && \ + (cd /opt/rocm/lib && tar cf - rocblas/library) | (cd ../../dist/linux-amd64/lib/ollama && tar xf - ) FROM --platform=linux/amd64 centos:7 AS cpu-builder-amd64 ARG CMAKE_VERSION @@ -160,7 +160,7 @@ COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ l ARG GOFLAGS ARG CGO_CFLAGS RUN --mount=type=cache,target=/root/.ccache \ - go build -trimpath -o dist/linux-amd64/ollama . + go build -trimpath -o dist/linux-amd64/bin/ollama . # Intermediate stage used for ./scripts/build_linux.sh FROM --platform=linux/arm64 cpu-build-arm64 AS build-arm64 @@ -176,20 +176,29 @@ COPY --from=cuda-12-build-server-arm64 /go/src/github.com/ollama/ollama/llm/buil ARG GOFLAGS ARG CGO_CFLAGS RUN --mount=type=cache,target=/root/.ccache \ - go build -trimpath -o dist/linux-arm64/ollama . + go build -trimpath -o dist/linux-arm64/bin/ollama . 
+ +# Strip out ROCm dependencies to keep the primary image lean +FROM --platform=linux/amd64 ubuntu:22.04 as amd64-libs-without-rocm +COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /scratch/ +RUN cd /scratch/ollama/ && rm -rf rocblas libamd* libdrm* libroc* libhip* libhsa* # Runtime stages FROM --platform=linux/amd64 ubuntu:22.04 as runtime-amd64 +COPY --from=amd64-libs-without-rocm /scratch/ /lib/ RUN apt-get update && apt-get install -y ca-certificates -COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/ollama /bin/ollama +COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/ + FROM --platform=linux/arm64 ubuntu:22.04 as runtime-arm64 +COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/ RUN apt-get update && apt-get install -y ca-certificates -COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/ollama /bin/ollama +COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/bin/ /bin/ # Radeon images are much larger so we keep it distinct from the CPU/CUDA image FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete as runtime-rocm RUN update-pciids -COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/ollama /bin/ollama +COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/ +RUN ln -s /opt/rocm/lib /lib/ollama EXPOSE 11434 ENV OLLAMA_HOST 0.0.0.0 diff --git a/app/ollama.iss b/app/ollama.iss index e9cf48ec..bce0a337 100644 --- a/app/ollama.iss +++ b/app/ollama.iss @@ -87,11 +87,11 @@ DialogFontSize=12 [Files] Source: ".\app.exe"; DestDir: "{app}"; DestName: "{#MyAppExeName}" ; Flags: ignoreversion 64bit -Source: "..\ollama.exe"; DestDir: "{app}"; Flags: ignoreversion 64bit -Source: "..\dist\windows-{#ARCH}\ollama_runners\*"; DestDir: "{app}\ollama_runners"; Flags: ignoreversion 64bit recursesubdirs +Source: "..\ollama.exe"; DestDir: "{app}\bin"; Flags: ignoreversion 64bit +Source: "..\dist\windows-{#ARCH}\lib\ollama\runners\*"; DestDir: "{app}\lib\ollama\runners"; Flags: ignoreversion 64bit recursesubdirs Source: "..\dist\ollama_welcome.ps1"; DestDir: "{app}"; Flags: ignoreversion Source: ".\assets\app.ico"; DestDir: "{app}"; Flags: ignoreversion -Source: "..\dist\windows-amd64\ollama_libs\*"; DestDir: "{app}\ollama_libs\"; Flags: ignoreversion recursesubdirs +Source: "..\dist\windows-amd64\lib\ollama\*"; DestDir: "{app}\lib\ollama\"; Flags: ignoreversion recursesubdirs [Icons] Name: "{group}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilename: "{app}\app.ico" @@ -99,7 +99,7 @@ Name: "{userstartup}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilen Name: "{userprograms}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilename: "{app}\app.ico" [Run] -Filename: "{cmd}"; Parameters: "/C set PATH={app};%PATH% & ""{app}\{#MyAppExeName}"""; Flags: postinstall nowait runhidden +Filename: "{cmd}"; Parameters: "/C set PATH={app}\bin;%PATH% & ""{app}\{#MyAppExeName}"""; Flags: postinstall nowait runhidden [UninstallRun] ; Filename: "{cmd}"; Parameters: "/C ""taskkill /im ''{#MyAppExeName}'' /f /t"; Flags: runhidden @@ -134,8 +134,8 @@ SetupAppRunningError=Another Ollama installer is running.%n%nPlease cancel or fi [Registry] Root: HKCU; Subkey: "Environment"; \ - ValueType: expandsz; ValueName: "Path"; ValueData: "{olddata};{app}"; \ - Check: NeedsAddPath('{app}') + ValueType: expandsz; ValueName: "Path"; ValueData: "{olddata};{app}\bin"; \ + Check: 
NeedsAddPath('{app}\bin') [Code] diff --git a/docs/linux.md b/docs/linux.md index ec730656..3ed2bed0 100644 --- a/docs/linux.md +++ b/docs/linux.md @@ -20,13 +20,12 @@ GPU. ## Manual install -### Download the `ollama` binary +### Download the `ollama` tar file -Ollama is distributed as a self-contained binary. Download it to a directory in your PATH: +Ollama is distributed as a tar file including GPU library dependencies. ```bash -sudo curl -L https://ollama.com/download/ollama-linux-amd64 -o /usr/bin/ollama -sudo chmod +x /usr/bin/ollama +curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz | sudo tar -C /usr -zxf - ``` ### Adding Ollama as a startup service (recommended) @@ -96,8 +95,7 @@ curl -fsSL https://ollama.com/install.sh | sh Or by downloading the ollama binary: ```bash -sudo curl -L https://ollama.com/download/ollama-linux-amd64 -o /usr/bin/ollama -sudo chmod +x /usr/bin/ollama +curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz | sudo tar -C /usr -zxf - ``` ## Installing specific versions diff --git a/envconfig/config.go b/envconfig/config.go index 7f0976c0..7e45a4f5 100644 --- a/envconfig/config.go +++ b/envconfig/config.go @@ -174,7 +174,7 @@ func RunnersDir() (p string) { defer func() { if p == "" { - slog.Error("unable to locate llm runner directory. Set OLLAMA_RUNNERS_DIR to the location of 'ollama_runners'") + slog.Error("unable to locate llm runner directory. Set OLLAMA_RUNNERS_DIR to the location of 'ollama/runners'") } }() @@ -190,7 +190,7 @@ func RunnersDir() (p string) { } var paths []string - for _, root := range []string{filepath.Dir(exe), cwd} { + for _, root := range []string{filepath.Dir(exe), filepath.Join(filepath.Dir(exe), ".."), cwd} { paths = append(paths, root, filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH), @@ -200,7 +200,7 @@ func RunnersDir() (p string) { // Try a few variations to improve developer experience when building from source in the local tree for _, path := range paths { - candidate := filepath.Join(path, "ollama_runners") + candidate := filepath.Join(path, "lib", "ollama", "runners") if _, err := os.Stat(candidate); err == nil { p = candidate break diff --git a/gpu/amd_common.go b/gpu/amd_common.go index 05747208..72d204f7 100644 --- a/gpu/amd_common.go +++ b/gpu/amd_common.go @@ -54,7 +54,7 @@ func commonAMDValidateLibDir() (string, error) { // Installer payload location if we're running the installed binary exe, err := os.Executable() if err == nil { - rocmTargetDir := filepath.Join(filepath.Dir(exe), "ollama_libs") + rocmTargetDir := filepath.Join(filepath.Dir(exe), "..", "lib", "ollama") if rocmLibUsable(rocmTargetDir) { slog.Debug("detected ROCM next to ollama executable " + rocmTargetDir) return rocmTargetDir, nil diff --git a/gpu/amd_windows.go b/gpu/amd_windows.go index 5d25a966..a0ae7c96 100644 --- a/gpu/amd_windows.go +++ b/gpu/amd_windows.go @@ -153,7 +153,7 @@ func AMDValidateLibDir() (string, error) { // Installer payload (if we're running from some other location) localAppData := os.Getenv("LOCALAPPDATA") appDir := filepath.Join(localAppData, "Programs", "Ollama") - rocmTargetDir := filepath.Join(appDir, "ollama_libs") + rocmTargetDir := filepath.Join(appDir, "..", "lib", "ollama") if rocmLibUsable(rocmTargetDir) { slog.Debug("detected ollama installed ROCm at " + rocmTargetDir) return rocmTargetDir, nil diff --git a/gpu/gpu.go b/gpu/gpu.go index eb87807a..391c98a8 100644 --- a/gpu/gpu.go +++ b/gpu/gpu.go @@ -653,8 +653,8 @@ func GetDepDir() string { slog.Warn("failed to lookup working directory", 
"error", err) } // Scan for any of our dependeices, and pick first match - for _, root := range []string{filepath.Dir(appExe), cwd} { - libDep := "ollama_libs" + for _, root := range []string{filepath.Dir(appExe), filepath.Join(filepath.Dir(appExe), ".."), cwd} { + libDep := filepath.Join("lib", "ollama") if _, err := os.Stat(filepath.Join(root, libDep)); err == nil { return filepath.Join(root, libDep) } diff --git a/llm/generate/gen_linux.sh b/llm/generate/gen_linux.sh index dc9dda5a..aef03f9a 100755 --- a/llm/generate/gen_linux.sh +++ b/llm/generate/gen_linux.sh @@ -189,7 +189,7 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS} -DGGML_STATIC=off" BUILD_DIR="../build/linux/${ARCH}/cuda${CUDA_VARIANT}" export LLAMA_SERVER_LDFLAGS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda" - CUDA_DIST_DIR="${CUDA_DIST_DIR:-${DIST_BASE}/ollama_libs}" + CUDA_DIST_DIR="${CUDA_DIST_DIR:-${DIST_BASE}/lib/ollama}" build install echo "Installing CUDA dependencies in ${CUDA_DIST_DIR}" @@ -213,7 +213,7 @@ if [ -z "${OLLAMA_SKIP_ONEAPI_GENERATE}" -a -d "${ONEAPI_ROOT}" ]; then CC=icx CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL=ON -DGGML_SYCL_F16=OFF" BUILD_DIR="../build/linux/${ARCH}/oneapi" - ONEAPI_DIST_DIR="${DIST_BASE}/ollama_libs" + ONEAPI_DIST_DIR="${DIST_BASE}/lib/ollama" export LLAMA_SERVER_LDFLAGS="-fsycl -lOpenCL -lmkl_core -lmkl_sycl_blas -lmkl_intel_ilp64 -lmkl_tbb_thread -ltbb" DEBUG_FLAGS="" # icx compiles with -O0 if we pass -g, so we must remove it build @@ -260,7 +260,7 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then echo "Building custom ROCM GPU" fi BUILD_DIR="../build/linux/${ARCH}/rocm${ROCM_VARIANT}" - ROCM_DIST_DIR="${DIST_BASE}/ollama_libs" + ROCM_DIST_DIR="${DIST_BASE}/lib/ollama" # TODO figure out how to disable runpath (rpath) # export CMAKE_HIP_FLAGS="-fno-rtlib-add-rpath" # doesn't work export LLAMA_SERVER_LDFLAGS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu" diff --git a/llm/generate/gen_windows.ps1 b/llm/generate/gen_windows.ps1 index 42708d3e..4d43c9e2 100644 --- a/llm/generate/gen_windows.ps1 +++ b/llm/generate/gen_windows.ps1 @@ -35,7 +35,7 @@ function init_vars { ) $script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on") $script:ARCH = $Env:PROCESSOR_ARCHITECTURE.ToLower() - $script:DIST_BASE = "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_runners" + $script:DIST_BASE = "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\runners" md "$script:DIST_BASE" -ea 0 > $null if ($env:CGO_CFLAGS -contains "-g") { $script:cmakeDefs += @("-DCMAKE_VERBOSE_MAKEFILE=on", "-DLLAMA_SERVER_VERBOSE=on", "-DCMAKE_BUILD_TYPE=RelWithDebInfo") @@ -286,11 +286,11 @@ function build_cuda() { sign install - md "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" -ea 0 > $null - write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" - cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" - cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" - cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" + md "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" -ea 0 > $null 
+ write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" } else { write-host "Skipping CUDA generation step" } @@ -324,17 +324,17 @@ function build_oneapi() { sign install - md "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" -ea 0 > $null - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libirngmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libmmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_level_zero.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_unified_runtime.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_win_proxy_loader.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\svml_dispmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\sycl7.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" - cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_core.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" - cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_sycl_blas.4.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" - cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_tbb_thread.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" + md "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" -ea 0 > $null + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libirngmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libmmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_level_zero.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_unified_runtime.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_win_proxy_loader.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\svml_dispmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\sycl7.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_core.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_sycl_blas.4.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_tbb_thread.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" } else { Write-Host "Skipping oneAPI generation step" } @@ -384,11 +384,11 @@ function build_rocm() { sign install - md "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\rocblas\library\" -ea 0 > $null - cp "${env:HIP_PATH}\bin\hipblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" - cp "${env:HIP_PATH}\bin\rocblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" + md 
"${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\rocblas\library\" -ea 0 > $null + cp "${env:HIP_PATH}\bin\hipblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${env:HIP_PATH}\bin\rocblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" # amdhip64.dll dependency comes from the driver and must be installed on the host to use AMD GPUs - cp "${env:HIP_PATH}\bin\rocblas\library\*" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\rocblas\library\" + cp "${env:HIP_PATH}\bin\rocblas\library\*" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\rocblas\library\" } else { write-host "Skipping ROCm generation step" } diff --git a/scripts/build_windows.ps1 b/scripts/build_windows.ps1 index 50b60230..9cebf1f4 100644 --- a/scripts/build_windows.ps1 +++ b/scripts/build_windows.ps1 @@ -122,8 +122,8 @@ function buildOllama() { /csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} ollama.exe if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} } - New-Item -ItemType Directory -Path .\dist\windows-${script:TARGET_ARCH}\ -Force - cp .\ollama.exe .\dist\windows-${script:TARGET_ARCH}\ + New-Item -ItemType Directory -Path .\dist\windows-${script:TARGET_ARCH}\bin\ -Force + cp .\ollama.exe .\dist\windows-${script:TARGET_ARCH}\bin\ } function buildApp() { @@ -142,22 +142,22 @@ function buildApp() { function gatherDependencies() { write-host "Gathering runtime dependencies" cd "${script:SRC_DIR}" - md "${script:DEPS_DIR}\ollama_libs" -ea 0 > $null + md "${script:DEPS_DIR}\lib\ollama" -ea 0 > $null # TODO - this varies based on host build system and MSVC version - drive from dumpbin output # currently works for Win11 + MSVC 2019 + Cuda V11 - cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\msvcp140*.dll" "${script:DEPS_DIR}\ollama_libs\" - cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DEPS_DIR}\ollama_libs\" - cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DEPS_DIR}\ollama_libs\" + cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\msvcp140*.dll" "${script:DEPS_DIR}\lib\ollama\" + cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DEPS_DIR}\lib\ollama\" + cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DEPS_DIR}\lib\ollama\" foreach ($part in $("runtime", "stdio", "filesystem", "math", "convert", "heap", "string", "time", "locale", "environment")) { - cp "$env:VCToolsRedistDir\..\..\..\Tools\Llvm\x64\bin\api-ms-win-crt-${part}*.dll" "${script:DEPS_DIR}\ollama_libs\" + cp "$env:VCToolsRedistDir\..\..\..\Tools\Llvm\x64\bin\api-ms-win-crt-${part}*.dll" "${script:DEPS_DIR}\lib\ollama\" } cp "${script:SRC_DIR}\app\ollama_welcome.ps1" "${script:SRC_DIR}\dist\" if ("${env:KEY_CONTAINER}") { write-host "about to sign" - foreach ($file in (get-childitem "${script:DEPS_DIR}\ollama_libs\cu*.dll") + @("${script:SRC_DIR}\dist\ollama_welcome.ps1")){ + foreach ($file in (get-childitem "${script:DEPS_DIR}\lib\ollama\cu*.dll") + @("${script:SRC_DIR}\dist\ollama_welcome.ps1")){ write-host "signing $file" & "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" ` /csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} $file diff --git a/scripts/install.sh b/scripts/install.sh index f0439b00..a02a0675 100644 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -66,7 +66,7 @@ fi for BINDIR in /usr/local/bin /usr/bin /bin; do echo $PATH | grep -q $BINDIR && break || continue done 
-OLLAMA_INSTALL_DIR=${OLLAMA_INSTALL_DIR:-${BINDIR}} +OLLAMA_INSTALL_DIR=$(dirname ${BINDIR}) status "Installing ollama to $OLLAMA_INSTALL_DIR" $SUDO install -o0 -g0 -m755 -d $BINDIR @@ -77,18 +77,22 @@ if curl -I --silent --fail --location "https://ollama.com/download/ollama-linux- "https://ollama.com/download/ollama-linux-${ARCH}.tgz${VER_PARAM}" | \ $SUDO tar -xzf - -C "$OLLAMA_INSTALL_DIR" BUNDLE=1 + if [ "$OLLAMA_INSTALL_DIR/bin/ollama" != "$BINDIR/ollama" ] ; then + status "Making ollama accessible in the PATH in $BINDIR" + $SUDO ln -sf "$OLLAMA_INSTALL_DIR/ollama" "$BINDIR/ollama" + fi else status "Downloading Linux ${ARCH} CLI" curl --fail --show-error --location --progress-bar -o "$TEMP_DIR/ollama"\ "https://ollama.com/download/ollama-linux-${ARCH}${VER_PARAM}" $SUDO install -o0 -g0 -m755 $TEMP_DIR/ollama $OLLAMA_INSTALL_DIR/ollama BUNDLE=0 + if [ "$OLLAMA_INSTALL_DIR/ollama" != "$BINDIR/ollama" ] ; then + status "Making ollama accessible in the PATH in $BINDIR" + $SUDO ln -sf "$OLLAMA_INSTALL_DIR/ollama" "$BINDIR/ollama" + fi fi -if [ "$OLLAMA_INSTALL_DIR/ollama" != "$BINDIR/ollama" ] ; then - status "Making ollama accessible in the PATH in $BINDIR" - $SUDO ln -sf "$OLLAMA_INSTALL_DIR/ollama" "$BINDIR/ollama" -fi install_success() { status 'The Ollama API is now available at 127.0.0.1:11434.' From f9e31da9463092d7b3661594788c259d6d55b3d9 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Thu, 15 Aug 2024 14:38:14 -0700 Subject: [PATCH 62/71] Review comments --- .github/workflows/release.yaml | 106 ++++++--------------------------- docs/linux.md | 8 +-- gpu/cuda_common.go | 2 +- gpu/gpu.go | 16 ++--- llm/generate/gen_windows.ps1 | 4 +- 5 files changed, 32 insertions(+), 104 deletions(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 4bd68455..508fbb35 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -183,10 +183,17 @@ jobs: name: windows-rocm-deps path: dist/deps/* - # CUDA v11 generation step - generate-windows-cuda-v11: + # CUDA generation step + generate-windows-cuda: environment: release runs-on: windows + strategy: + matrix: + cuda: + - version: "11" + url: 'https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe' + - version: "12" + url: 'https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_551.61_windows.exe' env: KEY_CONTAINER: ${{ vars.KEY_CONTAINER }} steps: @@ -220,11 +227,11 @@ jobs: with: go-version-file: go.mod cache: true - - name: 'Install CUDA' + - name: 'Install CUDA ${{ matrix.cuda.version }}' run: | $ErrorActionPreference = "Stop" write-host "downloading CUDA Installer" - Invoke-WebRequest -Uri "https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe" -OutFile "${env:RUNNER_TEMP}\cuda-install.exe" + Invoke-WebRequest -Uri "${{ matrix.cuda.url }}" -OutFile "${env:RUNNER_TEMP}\cuda-install.exe" write-host "Installing CUDA" Start-Process "${env:RUNNER_TEMP}\cuda-install.exe" -ArgumentList '-s' -NoNewWindow -Wait write-host "Completed CUDA" @@ -256,7 +263,7 @@ jobs: cp "${NVIDIA_DIR}\cublasLt64_*.dll" "dist\deps\" - uses: actions/upload-artifact@v4 with: - name: generate-windows-cuda-v11 + name: generate-windows-cuda-${{ matrix.cuda.version }} path: | llm/build/**/bin/* dist/windows-amd64/** @@ -265,95 +272,13 @@ jobs: name: windows-cuda-deps path: dist/deps/* - # CUDA v12 generation step - generate-windows-cuda-v12: - environment: release - runs-on: windows - env: - 
KEY_CONTAINER: ${{ vars.KEY_CONTAINER }} - steps: - - uses: actions/checkout@v4 - - name: Set Version - shell: bash - run: echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV - - uses: 'google-github-actions/auth@v2' - with: - project_id: 'ollama' - credentials_json: '${{ secrets.GOOGLE_SIGNING_CREDENTIALS }}' - - run: echo "${{ vars.OLLAMA_CERT }}" > ollama_inc.crt - - name: install Windows SDK 8.1 to get signtool - run: | - $ErrorActionPreference = "Stop" - write-host "downloading SDK" - Invoke-WebRequest -Uri "https://go.microsoft.com/fwlink/p/?LinkId=323507" -OutFile "${env:RUNNER_TEMP}\sdksetup.exe" - Start-Process "${env:RUNNER_TEMP}\sdksetup.exe" -ArgumentList @("/q") -NoNewWindow -Wait - write-host "Win SDK 8.1 installed" - gci -path 'C:\Program Files (x86)\Windows Kits\' -r -fi 'signtool.exe' - - name: install signing plugin - run: | - $ErrorActionPreference = "Stop" - write-host "downloading plugin" - Invoke-WebRequest -Uri "https://github.com/GoogleCloudPlatform/kms-integrations/releases/download/cng-v1.0/kmscng-1.0-windows-amd64.zip" -OutFile "${env:RUNNER_TEMP}\plugin.zip" - Expand-Archive -Path "${env:RUNNER_TEMP}\plugin.zip" -DestinationPath ${env:RUNNER_TEMP}\plugin\ - write-host "Installing plugin" - & "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet - write-host "plugin installed" - - uses: actions/setup-go@v5 - with: - go-version-file: go.mod - cache: true - - name: 'Install CUDA' - run: | - $ErrorActionPreference = "Stop" - write-host "downloading CUDA Installer" - Invoke-WebRequest -Uri "https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_551.61_windows.exe" -OutFile "${env:RUNNER_TEMP}\cuda-install.exe" - write-host "Installing CUDA" - Start-Process "${env:RUNNER_TEMP}\cuda-install.exe" -ArgumentList '-s' -NoNewWindow -Wait - write-host "Completed CUDA" - $cudaPath=((resolve-path "c:\Program Files\NVIDIA*\CUDA\v*\bin\nvcc.exe")[0].path | split-path | split-path) - $cudaVer=($cudaPath | split-path -leaf ) -replace 'v(\d+).(\d+)', '$1_$2' - echo "$cudaPath\bin" >> $env:GITHUB_PATH - echo "CUDA_PATH=$cudaPath" >> $env:GITHUB_ENV - echo "CUDA_PATH_V${cudaVer}=$cudaPath" >> $env:GITHUB_ENV - echo "CUDA_PATH_VX_Y=CUDA_PATH_V${cudaVer}" >> $env:GITHUB_ENV - - name: 'Verify CUDA' - run: nvcc -V - - run: go get ./... - - name: go generate - run: | - $gopath=(get-command go).source | split-path -parent - $cudabin=(get-command nvcc).source | split-path - & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1" - cd $env:GITHUB_WORKSPACE - $env:CMAKE_SYSTEM_VERSION="10.0.22621.0" - $env:PATH="$gopath;$cudabin;$env:PATH" - $env:OLLAMA_SKIP_CPU_GENERATE="1" - go generate -x ./... 
- - name: 'gather cuda dependencies' - run: | - $NVIDIA_DIR=(resolve-path 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\*\bin\')[0] - md "dist\deps" - cp "${NVIDIA_DIR}\cudart64_*.dll" "dist\deps\" - cp "${NVIDIA_DIR}\cublas64_*.dll" "dist\deps\" - cp "${NVIDIA_DIR}\cublasLt64_*.dll" "dist\deps\" - - uses: actions/upload-artifact@v4 - with: - name: generate-windows-cuda-v12 - path: | - llm/build/**/bin/* - dist/windows-amd64/** - - uses: actions/upload-artifact@v4 - with: - name: windows-cuda-deps - path: dist/deps/* # Import the prior generation steps and build the final windows assets build-windows: environment: release runs-on: windows needs: - - generate-windows-cuda-v11 - - generate-windows-cuda-v12 + - generate-windows-cuda - generate-windows-rocm - generate-windows-cpu env: @@ -397,7 +322,10 @@ jobs: name: generate-windows-cpu - uses: actions/download-artifact@v4 with: - name: generate-windows-cuda-v11 + name: generate-windows-cuda-11 + - uses: actions/download-artifact@v4 + with: + name: generate-windows-cuda-12 - uses: actions/download-artifact@v4 with: name: windows-cuda-deps diff --git a/docs/linux.md b/docs/linux.md index 3ed2bed0..d1d5892c 100644 --- a/docs/linux.md +++ b/docs/linux.md @@ -20,12 +20,12 @@ GPU. ## Manual install -### Download the `ollama` tar file +### Download `ollama` -Ollama is distributed as a tar file including GPU library dependencies. +Download and extract the Linux package: ```bash -curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz | sudo tar -C /usr -zxf - +curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz | sudo tar zx -C /usr ``` ### Adding Ollama as a startup service (recommended) @@ -95,7 +95,7 @@ curl -fsSL https://ollama.com/install.sh | sh Or by downloading the ollama binary: ```bash -curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz | sudo tar -C /usr -zxf - +curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz | sudo tar zx -C /usr ``` ## Installing specific versions diff --git a/gpu/cuda_common.go b/gpu/cuda_common.go index defaa60a..827cc9b4 100644 --- a/gpu/cuda_common.go +++ b/gpu/cuda_common.go @@ -28,7 +28,7 @@ func cudaGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) { return "CUDA_VISIBLE_DEVICES", strings.Join(ids, ",") } -func cudaGetVariant(gpuInfo CudaGPUInfo) string { +func cudaVariant(gpuInfo CudaGPUInfo) string { if runtime.GOARCH == "arm64" && runtime.GOOS == "linux" { if CudaTegra != "" { ver := strings.Split(CudaTegra, ".") diff --git a/gpu/gpu.go b/gpu/gpu.go index 391c98a8..72d237a6 100644 --- a/gpu/gpu.go +++ b/gpu/gpu.go @@ -225,7 +225,7 @@ func GetGPUInfo() GpuInfoList { return GpuInfoList{cpus[0].GpuInfo} } - depPath := GetDepDir() + depPath := LibraryDir() // Load ALL libraries cHandles = initCudaHandles() @@ -264,20 +264,20 @@ func GetGPUInfo() GpuInfoList { gpuInfo.computeMajor = int(memInfo.major) gpuInfo.computeMinor = int(memInfo.minor) gpuInfo.MinimumMemory = cudaMinimumMemory - cudaVariant := cudaGetVariant(gpuInfo) + variant := cudaVariant(gpuInfo) if depPath != "" { gpuInfo.DependencyPath = depPath // Check for variant specific directory - if cudaVariant != "" { - if _, err := os.Stat(filepath.Join(depPath, "cuda_"+cudaVariant)); err == nil { - gpuInfo.DependencyPath = filepath.Join(depPath, "cuda_"+cudaVariant) + if variant != "" { + if _, err := os.Stat(filepath.Join(depPath, "cuda_"+variant)); err == nil { + gpuInfo.DependencyPath = filepath.Join(depPath, "cuda_"+variant) } } } gpuInfo.Name = C.GoString(&memInfo.gpu_name[0]) gpuInfo.DriverMajor = 
driverMajor gpuInfo.DriverMinor = driverMinor - gpuInfo.Variant = cudaGetVariant(gpuInfo) + gpuInfo.Variant = variant // query the management library as well so we can record any skew between the two // which represents overhead on the GPU we must set aside on subsequent updates @@ -468,7 +468,7 @@ func FindGPULibs(baseLibName string, defaultPatterns []string) []string { slog.Debug("Searching for GPU library", "name", baseLibName) // Start with our bundled libraries - patterns := []string{filepath.Join(GetDepDir(), baseLibName)} + patterns := []string{filepath.Join(LibraryDir(), baseLibName)} switch runtime.GOOS { case "windows": @@ -642,7 +642,7 @@ func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) { } } -func GetDepDir() string { +func LibraryDir() string { // On Windows/linux we bundle the dependencies at the same level as the executable appExe, err := os.Executable() if err != nil { diff --git a/llm/generate/gen_windows.ps1 b/llm/generate/gen_windows.ps1 index 4d43c9e2..cbdfd09f 100644 --- a/llm/generate/gen_windows.ps1 +++ b/llm/generate/gen_windows.ps1 @@ -117,7 +117,7 @@ function build { if ($cmakeDefs -contains "-G") { $extra=@("-j8") } else { - $extra= @("--", "/p:CL_MPcount=8") + $extra= @("--", "/maxCpuCount:8") } write-host "building with: cmake --build $script:buildDir --config $script:config $($script:cmakeTargets | ForEach-Object { `"--target`", $_ }) $extra" & cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ }) $extra @@ -273,7 +273,7 @@ function build_cuda() { "-DGGML_CUDA=ON", "-DGGML_AVX=on", "-DGGML_AVX2=off", - "-DCMAKE_CUDA_FLAGS=-t8", + "-DCMAKE_CUDA_FLAGS=-t6", "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}", "-DCMAKE_CUDA_COMPILER_TOOLKIT_ROOT=$env:CUDA_PATH" ) From d8be22e47d460d1483846e2effb9b67fbfce1c0b Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Mon, 19 Aug 2024 12:07:18 -0700 Subject: [PATCH 63/71] Fix overlapping artifact name on CI --- .github/workflows/release.yaml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 508fbb35..f6489dac 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -269,7 +269,7 @@ jobs: dist/windows-amd64/** - uses: actions/upload-artifact@v4 with: - name: windows-cuda-deps + name: windows-cuda-deps-${{ matrix.cuda.version }} path: dist/deps/* @@ -328,7 +328,10 @@ jobs: name: generate-windows-cuda-12 - uses: actions/download-artifact@v4 with: - name: windows-cuda-deps + name: windows-cuda-deps-11 + - uses: actions/download-artifact@v4 + with: + name: windows-cuda-deps-12 - uses: actions/download-artifact@v4 with: name: windows-rocm-deps From f91c9e370923d3b10a88732ab577e2728022152d Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Mon, 19 Aug 2024 13:48:45 -0700 Subject: [PATCH 64/71] CI: handle directories during checksum (#6427) --- .github/workflows/release.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index f6489dac..aad49d98 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -472,7 +472,8 @@ jobs: merge-multiple: true - run: | ls -lh dist/ - (cd dist; sha256sum * > sha256sum.txt) + (cd dist; find . 
-type f | xargs sha256sum > ../sha256sum.txt) + mv sha256sum.txt dist/ cat dist/sha256sum.txt - name: Create or update Release run: | From 19e5a890f70b95a55c9de6a55357d78fc0a4ff81 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Mon, 19 Aug 2024 15:19:21 -0700 Subject: [PATCH 65/71] CI: remove directories from dist dir before upload step (#6429) --- .github/workflows/release.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index aad49d98..2cf4d2c2 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -474,6 +474,7 @@ jobs: ls -lh dist/ (cd dist; find . -type f | xargs sha256sum > ../sha256sum.txt) mv sha256sum.txt dist/ + mv dist/linux-???64 . cat dist/sha256sum.txt - name: Create or update Release run: | From a017cf2fea4aaa376087520382058c42cffce097 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Tue, 20 Aug 2024 07:26:38 -0700 Subject: [PATCH 66/71] Split rocm back out of bundle (#6432) We're over budget for github's maximum release artifact size with rocm + 2 cuda versions. This splits rocm back out as a discrete artifact, but keeps the layout so it can be extracted into the same location as the main bundle. --- .github/workflows/release.yaml | 1 + Dockerfile | 4 ++-- llm/generate/gen_linux.sh | 3 ++- scripts/build_linux.sh | 6 ++++++ scripts/install.sh | 5 +++++ 5 files changed, 16 insertions(+), 3 deletions(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 2cf4d2c2..9c1e3e13 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -475,6 +475,7 @@ jobs: (cd dist; find . -type f | xargs sha256sum > ../sha256sum.txt) mv sha256sum.txt dist/ mv dist/linux-???64 . + mv dist/linux-amd64-rocm . 
cat dist/sha256sum.txt - name: Create or update Release run: | diff --git a/Dockerfile b/Dockerfile index d4b86918..c46477b4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -95,8 +95,8 @@ ARG AMDGPU_TARGETS ENV GOARCH amd64 RUN --mount=type=cache,target=/root/.ccache \ OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh -RUN mkdir -p ../../dist/linux-amd64/lib/ollama && \ - (cd /opt/rocm/lib && tar cf - rocblas/library) | (cd ../../dist/linux-amd64/lib/ollama && tar xf - ) +RUN mkdir -p ../../dist/linux-amd64-rocm/lib/ollama && \ + (cd /opt/rocm/lib && tar cf - rocblas/library) | (cd ../../dist/linux-amd64-rocm/lib/ollama && tar xf - ) FROM --platform=linux/amd64 centos:7 AS cpu-builder-amd64 ARG CMAKE_VERSION diff --git a/llm/generate/gen_linux.sh b/llm/generate/gen_linux.sh index aef03f9a..6927dda8 100755 --- a/llm/generate/gen_linux.sh +++ b/llm/generate/gen_linux.sh @@ -260,7 +260,8 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then echo "Building custom ROCM GPU" fi BUILD_DIR="../build/linux/${ARCH}/rocm${ROCM_VARIANT}" - ROCM_DIST_DIR="${DIST_BASE}/lib/ollama" + # ROCm dependencies are too large to fit into a unified bundle + ROCM_DIST_DIR="${DIST_BASE}/../linux-${GOARCH}-rocm/lib/ollama" # TODO figure out how to disable runpath (rpath) # export CMAKE_HIP_FLAGS="-fno-rtlib-add-rpath" # doesn't work export LLAMA_SERVER_LDFLAGS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu" diff --git a/scripts/build_linux.sh b/scripts/build_linux.sh index adda2ad7..6cb0d0cd 100755 --- a/scripts/build_linux.sh +++ b/scripts/build_linux.sh @@ -24,8 +24,14 @@ for TARGETARCH in ${BUILD_ARCH}; do docker create --platform linux/$TARGETARCH --name builder-$TARGETARCH builder:$TARGETARCH rm -rf ./dist/linux-$TARGETARCH docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/dist/linux-$TARGETARCH ./dist + if echo ${TARGETARCH} | grep "amd64" > /dev/null; then + docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/dist/linux-$TARGETARCH-rocm ./dist + fi docker rm builder-$TARGETARCH echo "Compressing final linux bundle..." rm -f ./dist/ollama-linux-$TARGETARCH.tgz (cd dist/linux-$TARGETARCH && tar cf - . | ${GZIP} --best > ../ollama-linux-$TARGETARCH.tgz ) + if [ -d dist/linux-$TARGETARCH-rocm ]; then + (cd dist/linux-$TARGETARCH-rocm && tar cf - . | ${GZIP} --best > ../ollama-linux-$TARGETARCH-rocm.tgz ) + fi done diff --git a/scripts/install.sh b/scripts/install.sh index a02a0675..25f57565 100644 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -199,6 +199,11 @@ fi if check_gpu lspci amdgpu || check_gpu lshw amdgpu; then if [ $BUNDLE -ne 0 ]; then + status "Downloading Linux ROCm ${ARCH} bundle" + curl --fail --show-error --location --progress-bar \ + "https://ollama.com/download/ollama-linux-${ARCH}-rocm.tgz${VER_PARAM}" | \ + $SUDO tar -xzf - -C "$OLLAMA_INSTALL_DIR" + install_success status "AMD GPU ready." 
exit 0 From 5a28b9cf5fcb3994aa1a143118c73c7d1fbf3bf9 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Thu, 6 Jun 2024 08:59:04 -0700 Subject: [PATCH 67/71] bert --- convert/convert.go | 12 ++ convert/convert_bert.go | 176 +++++++++++++++++++++++++ convert/convert_test.go | 1 + convert/reader.go | 2 + convert/testdata/all-MiniLM-L6-v2.json | 124 +++++++++++++++++ convert/tokenizer.go | 31 ++--- 6 files changed, 331 insertions(+), 15 deletions(-) create mode 100644 convert/convert_bert.go create mode 100644 convert/testdata/all-MiniLM-L6-v2.json diff --git a/convert/convert.go b/convert/convert.go index 24c19aa4..f51e9665 100644 --- a/convert/convert.go +++ b/convert/convert.go @@ -66,6 +66,10 @@ type Converter interface { writeFile(io.WriteSeeker, llm.KV, []llm.Tensor) error } +type moreParser interface { + parseMore(fs.FS) error +} + // Convert writes an Ollama compatible model to the provided io.WriteSeeker based on configurations // and files it finds in the input path. // Supported input model formats include safetensors. @@ -95,6 +99,8 @@ func Convert(fsys fs.FS, ws io.WriteSeeker) error { conv = &gemma{} case "Phi3ForCausalLM": conv = &phi3{} + case "BertModel": + conv = &bert{} default: return errors.New("unsupported architecture") } @@ -103,6 +109,12 @@ func Convert(fsys fs.FS, ws io.WriteSeeker) error { return err } + if t, ok := conv.(moreParser); ok { + if err := t.parseMore(fsys); err != nil { + return err + } + } + t, err := parseTokenizer(fsys, conv.specialTokenTypes()) if err != nil { return err diff --git a/convert/convert_bert.go b/convert/convert_bert.go new file mode 100644 index 00000000..62fad147 --- /dev/null +++ b/convert/convert_bert.go @@ -0,0 +1,176 @@ +package convert + +import ( + "cmp" + "encoding/json" + "io/fs" + "path/filepath" + "slices" + "strings" + + "github.com/ollama/ollama/llm" +) + +type bert struct { + Parameters + NLayers uint32 `json:"n_layers"` + NumHiddenLayers uint32 `json:"num_hidden_layers"` + NLayer uint32 `json:"n_layer"` + MaxPositionEmbeddings uint32 `json:"max_position_embeddings"` + NCtx uint32 `json:"n_ctx"` + HiddenSize uint32 `json:"hidden_size"` + NEmbd uint32 `json:"n_embd"` + IntermediateSize uint32 `json:"intermediate_size"` + NInner uint32 `json:"n_inner"` + NumAttentionHeads uint32 `json:"num_attention_heads"` + NHead uint32 `json:"n_head"` + NumKeyValueHeads uint32 `json:"num_key_value_heads"` + LayerNormEPS float32 `json:"layer_norm_eps"` + LayerNormEpsilon float32 `json:"layer_norm_epsilon"` + NormEpsilon float32 `json:"norm_epsilon"` + + PoolingType uint32 +} + +var ( + _ Converter = (*bert)(nil) + _ moreParser = (*bert)(nil) +) + +func (p *bert) parseMore(fsys fs.FS) error { + bts, err := fs.ReadFile(fsys, "modules.json") + if err != nil { + return err + } + + var modules []struct { + Type string `json:"type"` + Path string `json:"path"` + } + + if err := json.Unmarshal(bts, &modules); err != nil { + return err + } + + var pooling string + for _, m := range modules { + if m.Type == "sentence_transformers.models.Pooling" { + pooling = m.Path + break + } + } + + if pooling != "" { + bts, err := fs.ReadFile(fsys, filepath.Join(pooling, "config.json")) + if err != nil { + return err + } + + var pc struct { + PoolingModeCLSToken bool `json:"pooling_mode_cls_token"` + PoolingModeMeanTokens bool `json:"pooling_mode_mean_tokens"` + } + + if err := json.Unmarshal(bts, &pc); err != nil { + return err + } + + if pc.PoolingModeMeanTokens { + p.PoolingType = 1 + } else if pc.PoolingModeCLSToken { + p.PoolingType = 2 + } + } + + return 
nil +} + +func (p *bert) KV(t *Tokenizer) llm.KV { + kv := p.Parameters.KV(t) + kv["general.architecture"] = "bert" + kv["general.name"] = "bert" + kv["bert.attention.causal"] = false + kv["bert.pooling_type"] = p.PoolingType + + kv["bert.block_count"] = cmp.Or(p.NLayers, p.NumHiddenLayers, p.NLayer) + + if contextLength := cmp.Or(p.MaxPositionEmbeddings, p.NCtx); contextLength > 0 { + kv["bert.context_length"] = contextLength + } + + if embeddingLength := cmp.Or(p.HiddenSize, p.NEmbd); embeddingLength > 0 { + kv["bert.embedding_length"] = cmp.Or(p.HiddenSize, p.NEmbd) + } + + if feedForwardLength := cmp.Or(p.IntermediateSize, p.NInner); feedForwardLength > 0 { + kv["bert.feed_forward_length"] = cmp.Or(p.IntermediateSize, p.NInner) + } + + if headCount := cmp.Or(p.NumAttentionHeads, p.NHead); headCount > 0 { + kv["bert.attention.head_count"] = cmp.Or(p.NumAttentionHeads, p.NHead) + } + + if layerNormEpsilon := cmp.Or(p.LayerNormEPS, p.LayerNormEpsilon, p.NormEpsilon); layerNormEpsilon > 0 { + kv["bert.attention.layer_norm_epsilon"] = layerNormEpsilon + } + + kv["tokenizer.ggml.model"] = "bert" + kv["tokenizer.ggml.token_type_count"] = uint32(2) + + // convert to phantom space tokens + for i, e := range t.Tokens { + if strings.HasPrefix(e, "[") && strings.HasSuffix(e, "]") { + // noop + } else if strings.HasPrefix(e, "##") { + t.Tokens[i] = e[2:] + } else { + t.Tokens[i] = "\u2581" + e + } + } + + kv["tokenizer.ggml.tokens"] = t.Tokens + + return kv +} + +func (p *bert) Tensors(ts []Tensor) []llm.Tensor { + var out []llm.Tensor + for _, t := range ts { + if slices.Contains([]string{ + "embeddings.position_ids", + "pooler.dense.weight", + "pooler.dense.bias", + }, t.Name()) { + continue + } + + name := p.tensorName(t.Name()) + out = append(out, llm.Tensor{ + Name: name, + Kind: t.Kind(), + Shape: t.Shape(), + WriterTo: t, + }) + } + + return out +} + +func (bert) tensorName(n string) string { + return strings.NewReplacer( + "encoder.layer", "blk", + "encoder.layers", "blk", + "embeddings.word_embeddings", "token_embd", + "embeddings.token_type_embeddings", "token_types", + "embeddings.LayerNorm", "token_embd_norm", + "embeddings.position_embeddings", "position_embd", + "attention.self.query", "attn_q", + "attention.self.key", "attn_k", + "attention.self.value", "attn_v", + "attention.output.dense", "attn_output", + "attention.output.LayerNorm", "attn_output_norm", + "intermediate.dense", "ffn_up", + "output.dense", "ffn_down", + "output.LayerNorm", "layer_output_norm", + ).Replace(n) +} diff --git a/convert/convert_test.go b/convert/convert_test.go index cb2c585e..e3ab0098 100644 --- a/convert/convert_test.go +++ b/convert/convert_test.go @@ -67,6 +67,7 @@ func TestConvertFull(t *testing.T) { "gemma-2b-it", // microsoft/Phi-3-mini-128-instruct@d548c233192db00165d842bf8edff054bb3212f8 "Phi-3-mini-128k-instruct", + "all-MiniLM-L6-v2", } for i := range cases { diff --git a/convert/reader.go b/convert/reader.go index ce95208e..294a7c40 100644 --- a/convert/reader.go +++ b/convert/reader.go @@ -37,6 +37,8 @@ const ( func (t tensorBase) Kind() uint32 { if strings.HasSuffix(t.name, ".block_sparse_moe.gate.weight") { return 0 + } else if t.name == "embeddings.token_type_embeddings.weight" { + return 0 } switch len(t.shape) { diff --git a/convert/testdata/all-MiniLM-L6-v2.json b/convert/testdata/all-MiniLM-L6-v2.json new file mode 100644 index 00000000..15c8f039 --- /dev/null +++ b/convert/testdata/all-MiniLM-L6-v2.json @@ -0,0 +1,124 @@ +{ + "general.architecture": "bert", + "general.file_type": 
"1", + "general.quantization_version": "2", + "bert.attention.causal": "false", + "bert.attention.head_count": "12", + "bert.attention.layer_norm_epsilon": "1e-12", + "bert.block_count": "6", + "bert.context_length": "512", + "bert.embedding_length": "384", + "bert.feed_forward_length": "1536", + "bert.pooling_type": "1", + "tokenizer.ggml.model": "bert", + "tokenizer.ggml.padding_token_id": "0", + "tokenizer.ggml.unknown_token_id": "100", + "tokenizer.ggml.cls_token_id": "101", + "tokenizer.ggml.seperator_token_id": "102", + "tokenizer.ggml.mask_token_id": "103", + "tokenizer.ggml.token_type_count": "2", + "tokenizer.ggml.scores": "6db964fe67338aca57790481a390121ff3dd643eebe49f7dd308029ad99abb6f", + "tokenizer.ggml.token_type": "98d247c5404b6b18f05f133b92dd56edf6efefefac326794b00d7b351f6c5aa1", + "tokenizer.ggml.tokens": "9efe405e229a45ff9916f54c475d151d2200cd2ab0006f347abfb069cf096c86", + "token_embd.weight": "8c1ee80a9ea4f65aa385ba30112010068af3d209bebc6e149d3d4589c2cd0a5a", + "position_embd.weight": "6c516f0b1c4e2388ab90394dd80ad69e4e4509b890982fc3408108ae66210eb6", + "token_types.weight": "f879f8e422ed211948f28b560d3c5e17aae7993f063b51196a28cf5c0fb3da21", + "token_embd_norm.weight": "75076e095d717aab96f8b6beeee503c27940d9a76f2b891a0e3de72f8a6043e4", + "token_embd_norm.bias": "298735285ffe944e1bf03e5d35c7280326b85cf121bde9874f1af5dc51ab939d", + "blk.0.attn_q.weight": "ab0923ce4c1549175112dcdfcc860fe30137f991e03ea6857fb5993670adaf6c", + "blk.0.attn_q.bias": "a3ec29551dabf976e1d34256b8ab5ab7b758f3ed9742c3cafdbd984d5441df62", + "blk.0.attn_k.weight": "4c1038a6d035c3e9ffed7fa672b614627814752503755fbad0cfb76a41ad71ba", + "blk.0.attn_k.bias": "e0363930eb588d91816aa3d230bb03b6e2551c165117b80b8d60397413819ef9", + "blk.0.attn_v.weight": "425e2e53e3f00ce98d29c3e6a161eb55d3e6ae0d96fdb9f6242d1c4fd6eef4b3", + "blk.0.attn_v.bias": "6579173a1e65ee124fbd0bd53cbdca4225515b4f2c5f18fb1bfd000f5978f9bb", + "blk.0.attn_output.weight": "a6d70a08cd7164de5d12af65d86d657c3db35aaecde778b2b3fda9193c4c9802", + "blk.0.attn_output.bias": "2b8d12c4f9a9c5bfaa29c597839568f6e0525cb41eeaf64ddeb6bd84dfeb9701", + "blk.0.attn_output_norm.weight": "bbe6e502a473228b525aeed26cc31b7db123ad63bdc5a6eebac6ea70b8b51d62", + "blk.0.attn_output_norm.bias": "36eaacaf0007c5c62daea97aab0115390c0682914f78482e37eb76885f4b7a50", + "blk.0.ffn_up.weight": "24654561c76ce387d125759ba843f06b904ef721fcceaeff6ccc62180a48e874", + "blk.0.ffn_up.bias": "fd3f0126aa1d95768fa60eb6f4ab8a2763cfcb7e5405f35b92353031d86f4d34", + "blk.0.ffn_down.weight": "97a829763a6a5bf3329ceb4d39c424ba4787d61653a5b0bbd1f84782e4d4e0ca", + "blk.0.ffn_down.bias": "7aa980c30ae8b4ee7f69df28808dbf5c431f56ccc4a80340f644a0419f16c054", + "blk.0.layer_output_norm.weight": "ef30dad4c2a083ae1ff5039a2a6cda60ecc89bf1e486a6f8c0d15f50589603f8", + "blk.0.layer_output_norm.bias": "8b1b77e67568b1bce43fc476de1b177c53ff688d66beb66995e8eb3dc290da8a", + "blk.1.attn_q.weight": "284331622a1f6f9b87ccee4f652bd66a394ca493c4d93be4d1844e4f6159ad10", + "blk.1.attn_q.bias": "e24ebd4860330e08f6bfdd077a82db0bee33f4c8846cf1db26327a34754c7069", + "blk.1.attn_k.weight": "729dd0d555544b5bd0f7580b3c8b384256b974605f0e7487b95f295aa032997d", + "blk.1.attn_k.bias": "2aa51a828a858f35473f54477583fea54ce2ccc34ea60fbd1d228fbe9bca827f", + "blk.1.attn_v.weight": "6be304671cc311d5ca5c103f2b51467ee800c589bc5b8101e09ff5aed1f68c21", + "blk.1.attn_v.bias": "43bcbab78a8819e07f723bc9e5b737b71e87a7594f15234e882b63e327a64199", + "blk.1.attn_output.weight": "15ec8a1a12b26c9976445308a09f748ab0e4bef0f583d13ab08c3129f8738d73", + 
"blk.1.attn_output.bias": "dac2146f4baa6ed16f6c0dc7443831fb7ec79bedcceafd80d1a4b628a1bb072d", + "blk.1.attn_output_norm.weight": "d2151eb33bffac536787a4c9a5d2b31c7a80b17c4611877842a3cce2cd6e98d8", + "blk.1.attn_output_norm.bias": "31e1b779716dafb855d2cf5631ee168a0ccf372eb9c6ea6091f66fa97a9b9d2d", + "blk.1.ffn_up.weight": "a57547fc3fc3b77406f5cdcb0c87af9bc184701f175c39c1f35297826fce3cc7", + "blk.1.ffn_up.bias": "123be6d541d086202913c75d878c54d59a749f3af7b58f7ef9eb9e7c62a24c9a", + "blk.1.ffn_down.weight": "cfdb79788377e5cbded8790cd41b9e66c397ecab75474071fcd7cf32d30f9613", + "blk.1.ffn_down.bias": "bcb58315519a573097960891c9ae41cf4c685ab78c3e0e77471471758a7eae88", + "blk.1.layer_output_norm.weight": "819b554271452bfb1d84c2603b90377b2e41a0ac1e3aa8b417ccf9dce63375bd", + "blk.1.layer_output_norm.bias": "47a3433ac27f5ce8947fb38dd491f3706df4ef6adb0ddf74612bf0f54b19e164", + "blk.2.attn_q.weight": "1557a9ea852b1880551f7290e00aded4f35e6c4180fdcbed1b0039bf805f639e", + "blk.2.attn_q.bias": "c3bfe5f3066f655fd36b055530997b59ff33ef013563aaeb3cb8ff07dabd59a9", + "blk.2.attn_k.weight": "cfd08eb69c61ae2f9f14f9b7ff5c5394ca264b1a9f3d48156677f90dd1766289", + "blk.2.attn_k.bias": "9b839bc0e79974a0b3f5d1895972bc6f5c9a1bc16052e1af786e6a530758152d", + "blk.2.attn_v.weight": "02b26b1208480eaeeb00e7b4cf8b690006ca14759357fc44ed4a2a8924ead993", + "blk.2.attn_v.bias": "e7e6f0089fded1659a867ab736c220d9653ea7da6b1b94baf5c8d30a748b63ab", + "blk.2.attn_output.weight": "a1db121c7d33806b349cadd050300a57db49fdc91224fd07c9ac43bf4299dc79", + "blk.2.attn_output.bias": "7675128b6a92555cd955c820311e91e9417d31f48848f45d047b4100c62148b3", + "blk.2.attn_output_norm.weight": "5b4595e0fbcba67a700c4331adf746d2fba3546364a4db5607ae241947bb1a21", + "blk.2.attn_output_norm.bias": "7b8e16826ea30e5a2ba0b02e0095a901775981a296e98819625320e983060d08", + "blk.2.ffn_up.weight": "a0d815d946ac07a65095c4ae4df77b818845e6d97795c7d82f55e689d944db59", + "blk.2.ffn_up.bias": "ce37c0a4174d6bf773ded7bd016ede627ad3bdb8bc99b9992a18dc8e8898f252", + "blk.2.ffn_down.weight": "f6231d2a25426fbd45b9f1160aa484220eb227ceef0348c4a6a6de890606e5ef", + "blk.2.ffn_down.bias": "429e00556e8dc63a785238b309b9d83738500c1ef6d736fe6526ad88ea496d27", + "blk.2.layer_output_norm.weight": "651457a573adf3f7dd9ee5dfe1c8e89389e94443993aab77ec6a0b05aa621e35", + "blk.2.layer_output_norm.bias": "41fbbeda7fd89b0cef5f945ae44011c316982390401d6f75ba8c6d365e185247", + "blk.3.attn_q.weight": "95a43f32949d2cb8d22815bb27a44abfc6665ba96221af817dfe058cb6ca72c6", + "blk.3.attn_q.bias": "f4e34385e75d8108b6b3bd336106e2133a8c9be0cc343dfe5dc48c32a823c7cb", + "blk.3.attn_k.weight": "6b892da6a17d4d3265265a15f695864a31813ee8c8e710ae9bc9e1adbc6c9a18", + "blk.3.attn_k.bias": "40b8067b641a56014cee42548240aa8930820958b1933004892b5f04fbaef39e", + "blk.3.attn_v.weight": "9fcd5922319dd2a461082a5ce040c1dfe65d87d70ca6547dd0b46eeecc3eeb2b", + "blk.3.attn_v.bias": "b528c56212e66931fdbe267ac327a9c2f87cd03baff3ea719e30afe681da15f1", + "blk.3.attn_output.weight": "e3b178c1b03981e75510e0d277af23ea59cc404b5394e61bd32291825719b502", + "blk.3.attn_output.bias": "712c84d39a6a5a9c06a09da8fd9939ba0d5525524a4bba61ea4de09b48f45cae", + "blk.3.attn_output_norm.weight": "d1ffac88e675592ff72f8a617be32b4a381d443b2f8f2645dbe44a1e5745aac0", + "blk.3.attn_output_norm.bias": "ea31a1c73146234c50e0e43f485c458413714867b8e2703af66482f7db2d6c40", + "blk.3.ffn_up.weight": "4ef4f3b9a1ea6ab2ef2eb6e8b008e06a44790d099d97482a05a51e39a29afac0", + "blk.3.ffn_up.bias": "06a4296dda16f452675c51f108079fe7722552d6521c737d97734943818b9a2b", + 
"blk.3.ffn_down.weight": "f114b2bebe392c7d80433bb880c6730293aa4561b0b0370dcdaf7472daebd847", + "blk.3.ffn_down.bias": "2c8e67831d28a3bf613fc7912ae3259b63d72abcaf4d30efd8800758400158de", + "blk.3.layer_output_norm.weight": "a1dfeb7b5a51dd56447312ca41e2ad2f361a3ea12ddc355127f5f4219fb0a482", + "blk.3.layer_output_norm.bias": "1ed630021b25c6c6fc93fd32988b9907df966d4982a93081f639aac3044618ab", + "blk.4.attn_q.weight": "b5fae4c1f9a5f33a2a2e816ac0c01c25f422e4efdd59ef1ed93da2610e5370fc", + "blk.4.attn_q.bias": "c2e376524ea98ac3b10d9eee19ecb1b1e261fa5149efe0232844c923dfb428fb", + "blk.4.attn_k.weight": "a4632f5ebf9321d9d08f9112a4e5dda2efe5671df4a4e67fee24845f5b14af16", + "blk.4.attn_k.bias": "a9a02ffb8b8b4f6dfe487a7e0341f1d5318c9d2b793a688f34cb1b22fc66ef60", + "blk.4.attn_v.weight": "10ad8deb81d9fa093b1e5c0f24ea82aa7df43e6aca49e260fcbea56eab8cc86a", + "blk.4.attn_v.bias": "7326813e181e021130bd33ac136293fcffccce2d1d8cb59041e5b13a8cceacf6", + "blk.4.attn_output.weight": "c92573088c7437c2b3cda51490e152c27fb19e5468df591eabba5a49d5398d44", + "blk.4.attn_output.bias": "14e10b419e5859af1eb685af5c330aee67048cd704dcead9217840c6f5393222", + "blk.4.attn_output_norm.weight": "02b6831c0e0fb0edbc579a92812a1dd972cb15d14fcd382d4427c5a7b300ac44", + "blk.4.attn_output_norm.bias": "7eed5cd503bb6bb6ceb1bc8b07cc077903a4f14fb8b9d6cdf39644815ecf1374", + "blk.4.ffn_up.weight": "8d0c91d62e74d6431321116a37cf3339e630bd50ba164d3304fc4fe8dd831223", + "blk.4.ffn_up.bias": "d325f07f73c005a273c484c7be8e7abb4d6e8a5c4fd093f5869133b97629d017", + "blk.4.ffn_down.weight": "7ba7bd81143f40537b84f938e403e19f30e4928625eb371de052b9025beb4d21", + "blk.4.ffn_down.bias": "2853d9c2a75288214a4bf4907dc19d04d01926f4913d302b1aa7bdbfcce0f7a1", + "blk.4.layer_output_norm.weight": "a4ed1885fa77b90fed5300c355ef0aa0c876a8c747151d9d790939d464d57d4f", + "blk.4.layer_output_norm.bias": "62142a81e813a9e636333b2b805d6bc3b17c5e7cd4b15adce1ada6bc9a32563c", + "blk.5.attn_q.weight": "afc1dff080a72c3daad01384b1448d476aaf789871017c8ff8e144788887995d", + "blk.5.attn_q.bias": "748a820371c1d4f872c84545b36358d239c35bf6c99e2812c237d88c3292763b", + "blk.5.attn_k.weight": "59e30c1ed8acd2cbb01de5f62e7804015b9ecf98ba157d98cab016344639eda5", + "blk.5.attn_k.bias": "f839520078f9e589496e982e86d0126c7aa14196047339abffcf49a696229f77", + "blk.5.attn_v.weight": "3e21fb874e21b90308e1f46af034a3c32d3eba1628d62ae5f2246d6af5818923", + "blk.5.attn_v.bias": "5cd4852bf95c1444d10d756750f6bf49f842c0b39e9953c7f408bb67c325ac8c", + "blk.5.attn_output.weight": "636ce6a7752895f204b9d01ba0aedd9a294f908b42f372c22a16d9dd590d7471", + "blk.5.attn_output.bias": "82d924d4b0d2b94f2bbff91619216d6967a3541ce9b1531a6a60457a67b5d219", + "blk.5.attn_output_norm.weight": "5e7bd0a8d3396080f3360d7c4700bf094a06216431bd014c4479eef72ecf4271", + "blk.5.attn_output_norm.bias": "66c6de5edda5466d029c6753780be81ccd4218bf8bc00680000e0f06856ab712", + "blk.5.ffn_up.weight": "5bbf6e7ea380e216e33f8bee06d25f2265359d3876a300e92bc6e41d48e33430", + "blk.5.ffn_up.bias": "9d795388bb36fb33ad3a37fea3ccb4937838e02800a608fb47d363cd06b47370", + "blk.5.ffn_down.weight": "2fd628974e7f075479dd227b46fbd48ae8d3ca34d735b36f391ac06410730368", + "blk.5.ffn_down.bias": "cd213ba9eaa75fa541648097fbe9c96e58077e6c3ad6ad2fb1f21f8350f44291", + "blk.5.layer_output_norm.weight": "159a9df41d15b7022d136f86a2a2631c4635f9816e957472217077b522bcf52a", + "blk.5.layer_output_norm.bias": "24c1f27ffd1eb4e5be7e3a2909943e6f0980635d761fa1efdd0c19645da23766" +} diff --git a/convert/tokenizer.go b/convert/tokenizer.go index 0d42a6d8..653df6d2 100644 --- 
a/convert/tokenizer.go +++ b/convert/tokenizer.go @@ -1,7 +1,6 @@ package convert import ( - "cmp" "crypto/sha256" "encoding/hex" "encoding/json" @@ -11,6 +10,8 @@ import ( "log/slog" "os" "slices" + + "golang.org/x/exp/maps" ) const ( @@ -184,32 +185,32 @@ func parseVocabularyFromTokenizer(fsys fs.FS) (*Vocabulary, error) { return nil, err } - var tokens []token + tokens := make(map[int]token, len(t.Model.Vocab)) for k, v := range t.Model.Vocab { - tokens = append(tokens, token{ + tokens[v] = token{ ID: v, Content: k, - }) + } } - for _, t := range t.AddedTokens { - t.UserDefined = true - tokens = append(tokens, t) + for _, token := range t.AddedTokens { + token.UserDefined = true + tokens[token.ID] = token } - slices.SortFunc(tokens, func(i, j token) int { - return cmp.Compare(i.ID, j.ID) - }) + keys := maps.Keys(tokens) + slices.Sort(keys) v := Vocabulary{Model: "gpt2"} - for _, t := range tokens { - v.Tokens = append(v.Tokens, t.Content) - v.Scores = append(v.Scores, float32(t.ID)) + for _, k := range keys { + token := tokens[k] + v.Tokens = append(v.Tokens, token.Content) + v.Scores = append(v.Scores, float32(token.ID)) switch { - case t.Special: + case token.Special: v.Types = append(v.Types, tokenTypeControl) - case t.UserDefined: + case token.UserDefined: v.Types = append(v.Types, tokenTypeUserDefined) default: v.Types = append(v.Types, tokenTypeNormal) From beb49eef65acefc64a6ae0562ce58467e6974fde Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Fri, 7 Jun 2024 14:55:56 -0700 Subject: [PATCH 68/71] create bert models from cli --- cmd/cmd.go | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/cmd/cmd.go b/cmd/cmd.go index fd7246c8..a8a02605 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -223,6 +223,14 @@ func tempZipFiles(path string) (string, error) { } files = append(files, js...) + // bert models require a nested config.json + // TODO(mxyng): merge this with the glob above + js, err = glob(filepath.Join(path, "**/*.json"), "text/plain") + if err != nil { + return "", err + } + files = append(files, js...) 
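+ // note: with Go's glob patterns, **/*.json only descends a single
+ // directory level, which covers e.g. a sentence-transformers
+ // 1_Pooling/config.json alongside the top-level *.json matched above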
+ if tks, _ := glob(filepath.Join(path, "tokenizer.model"), "application/octet-stream"); len(tks) > 0 { // add tokenizer.model if it exists, tokenizer.json is automatically picked up by the previous glob // tokenizer.model might be a unresolved git lfs reference; error if it is @@ -252,6 +260,11 @@ func tempZipFiles(path string) (string, error) { return "", err } + zfi.Name, err = filepath.Rel(path, file) + if err != nil { + return "", err + } + zf, err := zipfile.CreateHeader(zfi) if err != nil { return "", err From 3546bbd08c52df73eb6523b06b13f1b2dfeaa5fb Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Fri, 28 Jun 2024 13:27:05 -0700 Subject: [PATCH 69/71] convert gemma2 --- convert/convert.go | 11 ++++++-- convert/convert_bert.go | 9 +++--- convert/convert_gemma.go | 14 ++++----- convert/convert_gemma2.go | 44 +++++++++++++++++++++++++++++ convert/convert_llama.go | 19 ++++++------- convert/convert_mixtral.go | 9 ++++-- convert/convert_phi3.go | 11 ++++---- convert/convert_test.go | 1 + convert/reader.go | 12 ++++---- convert/reader_safetensors.go | 5 ++-- convert/reader_torch.go | 5 ++-- convert/testdata/gemma-2-9b-it.json | 6 ++++ convert/tokenizer_spm.go | 32 ++++++++++++++++++++- 13 files changed, 132 insertions(+), 46 deletions(-) create mode 100644 convert/convert_gemma2.go create mode 100644 convert/testdata/gemma-2-9b-it.json diff --git a/convert/convert.go b/convert/convert.go index f51e9665..5a314cdd 100644 --- a/convert/convert.go +++ b/convert/convert.go @@ -7,6 +7,7 @@ import ( "io" "io/fs" "log/slog" + "strings" "github.com/ollama/ollama/llm" ) @@ -58,11 +59,13 @@ type Converter interface { KV(*Tokenizer) llm.KV // Tensors maps input tensors to LLM tensors. Model specific modifications can be done here. Tensors([]Tensor) []llm.Tensor + // Replacements returns a list of string pairs to replace in tensor names. 
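+ // The pairs are applied in order via strings.NewReplacer when tensors are parsed.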
+ // See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details + Replacements() []string - // tensorName returns the LLM tensor name for a specific input name - tensorName(string) string // specialTokenTypes returns any special token types the model uses specialTokenTypes() []string + // writeFile writes the model to the provided io.WriteSeeker writeFile(io.WriteSeeker, llm.KV, []llm.Tensor) error } @@ -97,6 +100,8 @@ func Convert(fsys fs.FS, ws io.WriteSeeker) error { conv = &mixtral{} case "GemmaForCausalLM": conv = &gemma{} + case "Gemma2ForCausalLM": + conv = &gemma2{} case "Phi3ForCausalLM": conv = &phi3{} case "BertModel": @@ -131,7 +136,7 @@ func Convert(fsys fs.FS, ws io.WriteSeeker) error { slog.Debug("vocabulary", "size", len(t.Vocabulary.Tokens)) } - ts, err := parseTensors(fsys) + ts, err := parseTensors(fsys, strings.NewReplacer(conv.Replacements()...)) if err != nil { return err } diff --git a/convert/convert_bert.go b/convert/convert_bert.go index 62fad147..4547a705 100644 --- a/convert/convert_bert.go +++ b/convert/convert_bert.go @@ -144,9 +144,8 @@ func (p *bert) Tensors(ts []Tensor) []llm.Tensor { continue } - name := p.tensorName(t.Name()) out = append(out, llm.Tensor{ - Name: name, + Name: t.Name(), Kind: t.Kind(), Shape: t.Shape(), WriterTo: t, @@ -156,8 +155,8 @@ func (p *bert) Tensors(ts []Tensor) []llm.Tensor { return out } -func (bert) tensorName(n string) string { - return strings.NewReplacer( +func (bert) Replacements() []string { + return []string{ "encoder.layer", "blk", "encoder.layers", "blk", "embeddings.word_embeddings", "token_embd", @@ -172,5 +171,5 @@ func (bert) tensorName(n string) string { "intermediate.dense", "ffn_up", "output.dense", "ffn_down", "output.LayerNorm", "layer_output_norm", - ).Replace(n) + } } diff --git a/convert/convert_gemma.go b/convert/convert_gemma.go index 9213e157..333e4c83 100644 --- a/convert/convert_gemma.go +++ b/convert/convert_gemma.go @@ -44,15 +44,14 @@ func (p *gemma) KV(t *Tokenizer) llm.KV { } func (p *gemma) Tensors(ts []Tensor) []llm.Tensor { - var out []llm.Tensor + out := make([]llm.Tensor, 0, len(ts)) for _, t := range ts { - name := p.tensorName(t.Name()) - if strings.HasSuffix(name, "_norm.weight") { + if strings.HasSuffix(t.Name(), "_norm.weight") { t.SetRepacker(p.addOne) } out = append(out, llm.Tensor{ - Name: name, + Name: t.Name(), Kind: t.Kind(), Shape: t.Shape(), WriterTo: t, @@ -62,8 +61,8 @@ func (p *gemma) Tensors(ts []Tensor) []llm.Tensor { return out } -func (p *gemma) tensorName(n string) string { - return strings.NewReplacer( +func (p *gemma) Replacements() []string { + return []string{ "model.embed_tokens", "token_embd", "model.norm", "output_norm", "model.layers", "blk", @@ -76,8 +75,7 @@ func (p *gemma) tensorName(n string) string { "mlp.down_proj", "ffn_down", "mlp.up_proj", "ffn_up", "post_attention_layernorm", "ffn_norm", - "block_sparse_moe.gate", "ffn_inp", - ).Replace(n) + } } func (*gemma) addOne(_ string, data []float32, shape []uint64) ([]float32, error) { diff --git a/convert/convert_gemma2.go b/convert/convert_gemma2.go new file mode 100644 index 00000000..66be02d6 --- /dev/null +++ b/convert/convert_gemma2.go @@ -0,0 +1,44 @@ +package convert + +import ( + "github.com/ollama/ollama/llm" +) + +type gemma2 struct { + gemma + SlidingWindow uint32 `json:"sliding_window"` + AttentionLogitSoftcap float32 `json:"attn_logit_softcapping"` + FinalLogitSoftcap float32 `json:"final_logit_softcapping"` +} + +func (p *gemma2) KV(t *Tokenizer) llm.KV { + kv := p.Parameters.KV(t) + 
kv["general.architecture"] = "gemma2" + kv["general.name"] = "gemma2" + kv["gemma2.context_length"] = p.MaxPositionEmbeddings + kv["gemma2.embedding_length"] = p.HiddenSize + kv["gemma2.block_count"] = p.HiddenLayers + kv["gemma2.feed_forward_length"] = p.IntermediateSize + kv["gemma2.attention.head_count"] = p.NumAttentionHeads + kv["gemma2.attention.head_count_kv"] = p.NumKeyValueHeads + kv["gemma2.attention.layer_norm_rms_epsilon"] = p.RMSNormEPS + kv["gemma2.attention.key_length"] = p.HeadDim + kv["gemma2.attention.value_length"] = p.HeadDim + kv["gemma2.attention.sliding_window"] = p.SlidingWindow + kv["gemma2.attn_logit_softcapping"] = p.AttentionLogitSoftcap + kv["gemma2.final_logit_softcapping"] = p.FinalLogitSoftcap + kv["tokenizer.ggml.eot_token_id"] = uint32(107) + kv["tokenizer.ggml.middle_token_id"] = uint32(68) + kv["tokenizer.ggml.prefix_token_id"] = uint32(67) + kv["tokenizer.ggml.suffix_token_id"] = uint32(69) + return kv +} + +func (p *gemma2) Replacements() []string { + return append( + p.gemma.Replacements(), + "post_attention_layernorm", "post_attention_norm", + "pre_feedforward_layernorm", "ffn_norm", + "post_feedforward_layernorm", "post_ffw_norm", + ) +} diff --git a/convert/convert_llama.go b/convert/convert_llama.go index 178b13f3..498d1321 100644 --- a/convert/convert_llama.go +++ b/convert/convert_llama.go @@ -96,14 +96,13 @@ func (p *llama) KV(t *Tokenizer) llm.KV { func (p *llama) Tensors(ts []Tensor) []llm.Tensor { var out []llm.Tensor for _, t := range ts { - name := p.tensorName(t.Name()) - if strings.HasSuffix(name, "attn_q.weight") || - strings.HasSuffix(name, "attn_k.weight") { + if strings.HasSuffix(t.Name(), "attn_q.weight") || + strings.HasSuffix(t.Name(), "attn_k.weight") { t.SetRepacker(p.repack) } out = append(out, llm.Tensor{ - Name: name, + Name: t.Name(), Kind: t.Kind(), Shape: t.Shape(), WriterTo: t, @@ -113,8 +112,8 @@ func (p *llama) Tensors(ts []Tensor) []llm.Tensor { return out } -func (p *llama) tensorName(n string) string { - return strings.NewReplacer( +func (p *llama) Replacements() []string { + return []string{ "lm_head", "output", "model.embed_tokens", "token_embd", "model.norm", "output_norm", @@ -128,9 +127,7 @@ func (p *llama) tensorName(n string) string { "mlp.down_proj", "ffn_down", "mlp.up_proj", "ffn_up", "post_attention_layernorm", "ffn_norm", - // mixtral - "block_sparse_moe.gate", "ffn_gate_inp", - ).Replace(n) + } } func (p *llama) repack(name string, data []float32, shape []uint64) ([]float32, error) { @@ -140,9 +137,9 @@ func (p *llama) repack(name string, data []float32, shape []uint64) ([]float32, } var heads uint32 - if strings.HasSuffix(name, "q_proj.weight") { + if strings.HasSuffix(name, "attn_q.weight") { heads = p.NumAttentionHeads - } else if strings.HasSuffix(name, "k_proj.weight") { + } else if strings.HasSuffix(name, "attn_k.weight") { heads = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads) } else { return nil, fmt.Errorf("unknown tensor for repack: %s", name) diff --git a/convert/convert_mixtral.go b/convert/convert_mixtral.go index 3263a27b..97a86b30 100644 --- a/convert/convert_mixtral.go +++ b/convert/convert_mixtral.go @@ -15,8 +15,6 @@ type mixtral struct { NumExpertsPerToken uint32 `json:"num_experts_per_tok"` } -var _ Converter = (*mixtral)(nil) - func (p *mixtral) KV(t *Tokenizer) llm.KV { kv := p.llama.KV(t) @@ -72,6 +70,13 @@ func (p *mixtral) Tensors(ts []Tensor) []llm.Tensor { return append(out, p.llama.Tensors(ts)...) 
} +func (p *mixtral) Replacements() []string { + return append( + p.llama.Replacements(), + "block_sparse_moe.gate", "ffn_gate_inp", + ) +} + type experts []Tensor func (e experts) WriteTo(w io.Writer) (int64, error) { diff --git a/convert/convert_phi3.go b/convert/convert_phi3.go index 0f645217..4ee59ff5 100644 --- a/convert/convert_phi3.go +++ b/convert/convert_phi3.go @@ -74,8 +74,7 @@ func (p *phi3) Tensors(ts []Tensor) []llm.Tensor { out := make([]llm.Tensor, 0, len(ts)+2) for _, t := range ts { - name := p.tensorName(t.Name()) - if strings.HasPrefix(name, "blk.0.") { + if strings.HasPrefix(t.Name(), "blk.0.") { addRopeFactors.Do(func() { out = append(out, llm.Tensor{ Name: "rope_factors_long.weight", @@ -92,7 +91,7 @@ func (p *phi3) Tensors(ts []Tensor) []llm.Tensor { } out = append(out, llm.Tensor{ - Name: name, + Name: t.Name(), Kind: t.Kind(), Shape: t.Shape(), WriterTo: t, @@ -102,8 +101,8 @@ func (p *phi3) Tensors(ts []Tensor) []llm.Tensor { return out } -func (p *phi3) tensorName(n string) string { - return strings.NewReplacer( +func (p *phi3) Replacements() []string { + return []string{ "lm_head", "output", "model.embed_tokens", "token_embd", "model.norm", "output_norm", @@ -114,7 +113,7 @@ func (p *phi3) tensorName(n string) string { "mlp.down_proj", "ffn_down", "mlp.gate_up_proj", "ffn_up", "post_attention_layernorm", "ffn_norm", - ).Replace(n) + } } type ropeFactor []float32 diff --git a/convert/convert_test.go b/convert/convert_test.go index e3ab0098..e78afab7 100644 --- a/convert/convert_test.go +++ b/convert/convert_test.go @@ -68,6 +68,7 @@ func TestConvertFull(t *testing.T) { // microsoft/Phi-3-mini-128-instruct@d548c233192db00165d842bf8edff054bb3212f8 "Phi-3-mini-128k-instruct", "all-MiniLM-L6-v2", + "gemma-2-9b-it", } for i := range cases { diff --git a/convert/reader.go b/convert/reader.go index 294a7c40..5bba0406 100644 --- a/convert/reader.go +++ b/convert/reader.go @@ -35,9 +35,9 @@ const ( ) func (t tensorBase) Kind() uint32 { - if strings.HasSuffix(t.name, ".block_sparse_moe.gate.weight") { - return 0 - } else if t.name == "embeddings.token_type_embeddings.weight" { + if strings.HasSuffix(t.name, ".ffn_gate_inp.weight") || + t.name == "token_types.weight" { + // these tensors are always F32 return 0 } @@ -57,10 +57,10 @@ func (t *tensorBase) SetRepacker(fn repacker) { type repacker func(string, []float32, []uint64) ([]float32, error) -func parseTensors(fsys fs.FS) ([]Tensor, error) { +func parseTensors(fsys fs.FS, replacer *strings.Replacer) ([]Tensor, error) { patterns := []struct { Pattern string - Func func(fs.FS, ...string) ([]Tensor, error) + Func func(fs.FS, *strings.Replacer, ...string) ([]Tensor, error) }{ {"model-*-of-*.safetensors", parseSafetensors}, {"model.safetensors", parseSafetensors}, @@ -76,7 +76,7 @@ func parseTensors(fsys fs.FS) ([]Tensor, error) { } if len(matches) > 0 { - return pattern.Func(fsys, matches...) + return pattern.Func(fsys, replacer, matches...) 
} } diff --git a/convert/reader_safetensors.go b/convert/reader_safetensors.go index 42f902a5..32a362cd 100644 --- a/convert/reader_safetensors.go +++ b/convert/reader_safetensors.go @@ -8,6 +8,7 @@ import ( "io" "io/fs" "slices" + "strings" "github.com/d4l3k/go-bfloat16" "github.com/x448/float16" @@ -20,7 +21,7 @@ type safetensorMetadata struct { Offsets []int64 `json:"data_offsets"` } -func parseSafetensors(fsys fs.FS, ps ...string) ([]Tensor, error) { +func parseSafetensors(fsys fs.FS, replacer *strings.Replacer, ps ...string) ([]Tensor, error) { var ts []Tensor for _, p := range ps { f, err := fsys.Open(p) @@ -56,7 +57,7 @@ func parseSafetensors(fsys fs.FS, ps ...string) ([]Tensor, error) { offset: safetensorsPad(n, value.Offsets[0]), size: safetensorsPad(n, value.Offsets[1]) - safetensorsPad(n, value.Offsets[0]), tensorBase: &tensorBase{ - name: key, + name: replacer.Replace(key), shape: value.Shape, }, }) diff --git a/convert/reader_torch.go b/convert/reader_torch.go index 531996bf..1b3e1c9f 100644 --- a/convert/reader_torch.go +++ b/convert/reader_torch.go @@ -3,12 +3,13 @@ package convert import ( "io" "io/fs" + "strings" "github.com/nlpodyssey/gopickle/pytorch" "github.com/nlpodyssey/gopickle/types" ) -func parseTorch(fsys fs.FS, ps ...string) ([]Tensor, error) { +func parseTorch(fsys fs.FS, replacer *strings.Replacer, ps ...string) ([]Tensor, error) { var ts []Tensor for _, p := range ps { pt, err := pytorch.Load(p) @@ -27,7 +28,7 @@ func parseTorch(fsys fs.FS, ps ...string) ([]Tensor, error) { ts = append(ts, torch{ storage: t.(*pytorch.Tensor).Source, tensorBase: &tensorBase{ - name: k.(string), + name: replacer.Replace(k.(string)), shape: shape, }, }) diff --git a/convert/testdata/gemma-2-9b-it.json b/convert/testdata/gemma-2-9b-it.json new file mode 100644 index 00000000..90cdbee4 --- /dev/null +++ b/convert/testdata/gemma-2-9b-it.json @@ -0,0 +1,6 @@ +{ + "general.architecture": "gemma2", + "gemma2.attention.sliding_window": "4096", + "gemma2.attn_logit_softcapping": "50", + "gemma2.final_logit_softcapping": "30" +} diff --git a/convert/tokenizer_spm.go b/convert/tokenizer_spm.go index babf702c..5e506087 100644 --- a/convert/tokenizer_spm.go +++ b/convert/tokenizer_spm.go @@ -15,6 +15,11 @@ import ( ) func parseSentencePiece(fsys fs.FS) (*Vocabulary, error) { + ast, err := parseAdditionalSpecialTokens(fsys) + if err != nil { + return nil, err + } + bts, err := fs.ReadFile(fsys, "tokenizer.model") if err != nil { return nil, err @@ -37,7 +42,12 @@ func parseSentencePiece(fsys fs.FS) (*Vocabulary, error) { sentencepiece.ModelProto_SentencePiece_BYTE: v.Types = append(v.Types, int32(t)) default: - v.Types = append(v.Types, int32(sentencepiece.ModelProto_SentencePiece_NORMAL)) + tt := int32(sentencepiece.ModelProto_SentencePiece_NORMAL) + if slices.Contains(ast, piece.GetPiece()) { + tt = int32(sentencepiece.ModelProto_SentencePiece_CONTROL) + } + + v.Types = append(v.Types, tt) } } @@ -81,3 +91,23 @@ func parseSentencePiece(fsys fs.FS) (*Vocabulary, error) { return &v, nil } + +func parseAdditionalSpecialTokens(fsys fs.FS) ([]string, error) { + f, err := fsys.Open("special_tokens_map.json") + if errors.Is(err, os.ErrNotExist) { + return nil, nil + } else if err != nil { + return nil, err + } + defer f.Close() + + var m struct { + AdditionalSpecialTokens []string `json:"additional_special_tokens"` + } + + if err := json.NewDecoder(f).Decode(&m); err != nil { + return nil, err + } + + return m.AdditionalSpecialTokens, nil +} From 77903ab8b4fb8075faad7bde5bde2eee3173e407 Mon Sep 
17 00:00:00 2001 From: Michael Yang Date: Mon, 29 Jul 2024 14:53:02 -0700 Subject: [PATCH 70/71] llama3.1 --- convert/convert_bert.go | 1 - convert/convert_gemma.go | 1 - convert/convert_gemma2.go | 1 - convert/convert_llama.go | 43 +++++++++++++++++-- convert/convert_phi3.go | 1 - convert/convert_test.go | 1 + .../testdata/Meta-Llama-3.1-8B-Instruct.json | 3 ++ llm/memory_test.go | 1 - server/sched_test.go | 1 - 9 files changed, 44 insertions(+), 9 deletions(-) create mode 100644 convert/testdata/Meta-Llama-3.1-8B-Instruct.json diff --git a/convert/convert_bert.go b/convert/convert_bert.go index 4547a705..6e7d59fe 100644 --- a/convert/convert_bert.go +++ b/convert/convert_bert.go @@ -88,7 +88,6 @@ func (p *bert) parseMore(fsys fs.FS) error { func (p *bert) KV(t *Tokenizer) llm.KV { kv := p.Parameters.KV(t) kv["general.architecture"] = "bert" - kv["general.name"] = "bert" kv["bert.attention.causal"] = false kv["bert.pooling_type"] = p.PoolingType diff --git a/convert/convert_gemma.go b/convert/convert_gemma.go index 333e4c83..c4316808 100644 --- a/convert/convert_gemma.go +++ b/convert/convert_gemma.go @@ -26,7 +26,6 @@ var _ Converter = (*gemma)(nil) func (p *gemma) KV(t *Tokenizer) llm.KV { kv := p.Parameters.KV(t) kv["general.architecture"] = "gemma" - kv["general.name"] = "gemma" kv["gemma.context_length"] = p.MaxPositionEmbeddings kv["gemma.embedding_length"] = p.HiddenSize kv["gemma.block_count"] = p.HiddenLayers diff --git a/convert/convert_gemma2.go b/convert/convert_gemma2.go index 66be02d6..084f9c52 100644 --- a/convert/convert_gemma2.go +++ b/convert/convert_gemma2.go @@ -14,7 +14,6 @@ type gemma2 struct { func (p *gemma2) KV(t *Tokenizer) llm.KV { kv := p.Parameters.KV(t) kv["general.architecture"] = "gemma2" - kv["general.name"] = "gemma2" kv["gemma2.context_length"] = p.MaxPositionEmbeddings kv["gemma2.embedding_length"] = p.HiddenSize kv["gemma2.block_count"] = p.HiddenLayers diff --git a/convert/convert_llama.go b/convert/convert_llama.go index 498d1321..27f924fb 100644 --- a/convert/convert_llama.go +++ b/convert/convert_llama.go @@ -3,6 +3,7 @@ package convert import ( "cmp" "fmt" + "math" "strings" "github.com/pdevine/tensor" @@ -27,8 +28,14 @@ type llama struct { NumKeyValueHeads uint32 `json:"num_key_value_heads"` RopeTheta float32 `json:"rope_theta"` RopeScaling struct { - Type string `json:"type"` - Factor float32 `json:"factor"` + Type string `json:"type"` + RopeType string `json:"rope_type"` + Factor float32 `json:"factor"` + LowFrequencyFactor float32 `json:"low_freq_factor"` + HighFrequencyFactor float32 `json:"high_freq_factor"` + OriginalMaxPositionalEmbeddings uint32 `json:"original_max_positional_embeddings"` + + factors ropeFactor } `json:"rope_scaling"` RMSNormEPS float32 `json:"rms_norm_eps"` LayerNormEPS float32 `json:"layer_norm_eps"` @@ -42,7 +49,6 @@ var _ Converter = (*llama)(nil) func (p *llama) KV(t *Tokenizer) llm.KV { kv := p.Parameters.KV(t) kv["general.architecture"] = "llama" - kv["general.name"] = "llama" kv["llama.vocab_size"] = p.VocabSize kv["llama.block_count"] = cmp.Or(p.NLayers, p.NumHiddenLayers, p.NLayer) @@ -71,6 +77,27 @@ func (p *llama) KV(t *Tokenizer) llm.KV { if p.RopeScaling.Type == "linear" { kv["llama.rope.scaling.type"] = p.RopeScaling.Type kv["llama.rope.scaling.factor"] = p.RopeScaling.Factor + } else if p.RopeScaling.RopeType == "llama3" { + dim := p.HiddenSize / p.NumAttentionHeads + for i := uint32(0); i < dim; i += 2 { + factor := cmp.Or(p.RopeScaling.Factor, 8.0) + factorLow := cmp.Or(p.RopeScaling.LowFrequencyFactor, 
1.0) + factorHigh := cmp.Or(p.RopeScaling.HighFrequencyFactor, 4.0) + + original := cmp.Or(p.RopeScaling.OriginalMaxPositionalEmbeddings, 8192) + lambdaLow := float32(original) / factorLow + lambdaHigh := float32(original) / factorHigh + + lambda := 2 * math.Pi * math.Pow(float64(p.RopeTheta), float64(i)/float64(dim)) + if lambda < float64(lambdaHigh) { + p.RopeScaling.factors = append(p.RopeScaling.factors, 1.0) + } else if lambda > float64(lambdaLow) { + p.RopeScaling.factors = append(p.RopeScaling.factors, factor) + } else { + smooth := (float32(original)/float32(lambda) - factorLow) / (factorHigh - factorLow) + p.RopeScaling.factors = append(p.RopeScaling.factors, 1.0/((1-smooth)/factor+smooth)) + } + } } if p.NumKeyValueHeads > 0 { @@ -95,6 +122,16 @@ func (p *llama) KV(t *Tokenizer) llm.KV { func (p *llama) Tensors(ts []Tensor) []llm.Tensor { var out []llm.Tensor + + if p.RopeScaling.factors != nil { + out = append(out, llm.Tensor{ + Name: "rope_freqs.weight", + Kind: 0, + Shape: []uint64{uint64(len(p.RopeScaling.factors))}, + WriterTo: p.RopeScaling.factors, + }) + } + for _, t := range ts { if strings.HasSuffix(t.Name(), "attn_q.weight") || strings.HasSuffix(t.Name(), "attn_k.weight") { diff --git a/convert/convert_phi3.go b/convert/convert_phi3.go index 4ee59ff5..64d3d012 100644 --- a/convert/convert_phi3.go +++ b/convert/convert_phi3.go @@ -40,7 +40,6 @@ var _ Converter = (*phi3)(nil) func (p *phi3) KV(t *Tokenizer) llm.KV { kv := p.Parameters.KV(t) kv["general.architecture"] = "phi3" - kv["general.name"] = "phi3" kv["phi3.context_length"] = p.MaxPositionEmbeddings kv["phi3.embedding_length"] = cmp.Or(p.HiddenSize, p.NEmbd) kv["phi3.feed_forward_length"] = p.IntermediateSize diff --git a/convert/convert_test.go b/convert/convert_test.go index e78afab7..64b7df3b 100644 --- a/convert/convert_test.go +++ b/convert/convert_test.go @@ -62,6 +62,7 @@ func TestMain(m *testing.M) { func TestConvertFull(t *testing.T) { cases := []string{ "Meta-Llama-3-8B-Instruct", + "Meta-Llama-3.1-8B-Instruct", "Mistral-7B-Instruct-v0.2", "Mixtral-8x7B-Instruct-v0.1", "gemma-2b-it", diff --git a/convert/testdata/Meta-Llama-3.1-8B-Instruct.json b/convert/testdata/Meta-Llama-3.1-8B-Instruct.json new file mode 100644 index 00000000..ad7cd20a --- /dev/null +++ b/convert/testdata/Meta-Llama-3.1-8B-Instruct.json @@ -0,0 +1,3 @@ +{ + "rope_freqs.weight": "80fd5efb2f729381785b293a091a268cfeceb0079167f6ece9b07070e662b222" +} diff --git a/llm/memory_test.go b/llm/memory_test.go index 6cf0119f..ffb14286 100644 --- a/llm/memory_test.go +++ b/llm/memory_test.go @@ -33,7 +33,6 @@ func TestEstimateGPULayers(t *testing.T) { assert.Len(t, tensors, inputLayerCount+1) err = WriteGGUF(f, KV{ "general.architecture": "llama", - "general.name": "name", "llama.context_length": uint32(32), "llama.embedding_length": uint32(4096), "llama.block_count": uint32(inputLayerCount), diff --git a/server/sched_test.go b/server/sched_test.go index 713b9259..fb049574 100644 --- a/server/sched_test.go +++ b/server/sched_test.go @@ -117,7 +117,6 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est require.NoError(t, llm.WriteGGUF(f, llm.KV{ "general.architecture": "llama", - "general.name": "name", "llama.context_length": uint32(32), "llama.embedding_length": uint32(4096), "llama.block_count": uint32(1), From 90ca84172c2a98ecfd76eb7e05cd3e33e1dde507 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Thu, 22 Aug 2024 14:51:42 -0700 Subject: [PATCH 71/71] Fix embeddings memory corruption (#6467) * Fix embeddings 
memory corruption

The patch was leading to a buffer overrun corruption. Once removed, though,
parallelism in server.cpp led to hitting an assert due to slot/seq IDs being
>= token count. To work around this, only use slot 0 for embeddings.

* Fix embed integration test assumption

The token eval count has changed with recent llama.cpp bumps (0.3.5+)
---
 integration/embed_test.go | 8 ++---
 llm/ext_server/server.cpp | 8 ++++-
 llm/patches/08-pooling.diff | 60 -------------------------------------
 server/sched.go | 5 ++++
 4 files changed, 16 insertions(+), 65 deletions(-)
 delete mode 100644 llm/patches/08-pooling.diff

diff --git a/integration/embed_test.go b/integration/embed_test.go
index 10333d5d..4a68af68 100644
--- a/integration/embed_test.go
+++ b/integration/embed_test.go
@@ -70,8 +70,8 @@ func TestAllMiniLMEmbed(t *testing.T) {
 t.Fatalf("expected 0.010071031, got %.8f", res.Embeddings[0][0])
 }

- if res.PromptEvalCount != 8 {
- t.Fatalf("expected 8 prompt tokens, got %d", res.PromptEvalCount)
+ if res.PromptEvalCount != 6 {
+ t.Fatalf("expected 6 prompt tokens, got %d", res.PromptEvalCount)
 }
 }

@@ -102,8 +102,8 @@ func TestAllMiniLMBatchEmbed(t *testing.T) {
 t.Fatalf("expected 0.010071031 and -0.009802706, got %.8f and %.8f", res.Embeddings[0][0], res.Embeddings[1][0])
 }

- if res.PromptEvalCount != 16 {
- t.Fatalf("expected 16 prompt tokens, got %d", res.PromptEvalCount)
+ if res.PromptEvalCount != 12 {
+ t.Fatalf("expected 12 prompt tokens, got %d", res.PromptEvalCount)
 }
 }

diff --git a/llm/ext_server/server.cpp b/llm/ext_server/server.cpp
index 5717c17a..8e08b850 100644
--- a/llm/ext_server/server.cpp
+++ b/llm/ext_server/server.cpp
@@ -1429,7 +1429,13 @@ struct llama_server_context
 switch (task.type)
 {
 case TASK_TYPE_COMPLETION: {
- server_slot *slot = prefix_slot(task.data["prompt"]);
+ server_slot *slot = nullptr;
+ if (task.embedding_mode) {
+ // Embedding seq_id (aka slot id) must always be <= token length, so always use slot 0
+ slot = slots[0].available() ?
&slots[0] : nullptr; + } else { + slot = prefix_slot(task.data["prompt"]); + } if (slot == nullptr) { // if no slot is available, we defer this task for processing later diff --git a/llm/patches/08-pooling.diff b/llm/patches/08-pooling.diff deleted file mode 100644 index 2e4fe11e..00000000 --- a/llm/patches/08-pooling.diff +++ /dev/null @@ -1,60 +0,0 @@ -diff --git a/src/llama.cpp b/src/llama.cpp -index 721b8f4e..cfe7ac40 100644 ---- a/src/llama.cpp -+++ b/src/llama.cpp -@@ -8420,14 +8420,14 @@ struct llm_build_context { - } - - struct ggml_tensor * build_inp_mean() { -- lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens); -+ lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, cparams.n_seq_max); - cb(lctx.inp_mean, "inp_mean", -1); - ggml_set_input(lctx.inp_mean); - return lctx.inp_mean; - } - - struct ggml_tensor * build_inp_cls() { -- lctx.inp_cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); -+ lctx.inp_cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, cparams.n_seq_max); - cb(lctx.inp_cls, "inp_cls", -1); - ggml_set_input(lctx.inp_cls); - return lctx.inp_cls; -@@ -13847,19 +13847,16 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer)); - - float * data = (float *) lctx.inp_mean->data; -- memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean)); -+ memset(lctx.inp_mean->data, 0, n_tokens * cparams.n_seq_max * ggml_element_size(lctx.inp_mean)); - - std::vector sum(n_tokens, 0); - for (int i = 0; i < n_tokens; ++i) { - const llama_seq_id seq_id = batch.seq_id[i][0]; -- -- GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN"); -- - sum[seq_id] += 1; - } - -- std::vector div(n_tokens, 0.0f); -- for (int i = 0; i < n_tokens; ++i) { -+ std::vector div(cparams.n_seq_max, 0.0f); -+ for (uint32_t i = 0; i < cparams.n_seq_max; ++i) { - const uint64_t s = sum[i]; - if (s > 0) { - div[i] = 1.0f/float(s); -@@ -13879,14 +13876,11 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer)); - - uint32_t * data = (uint32_t *) lctx.inp_cls->data; -- memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls)); -+ memset(lctx.inp_cls->data, 0, cparams.n_seq_max * ggml_element_size(lctx.inp_cls)); - - for (int i = 0; i < n_tokens; ++i) { - const llama_seq_id seq_id = batch.seq_id[i][0]; - const llama_pos pos = batch.pos[i]; -- -- GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS"); -- - if (pos == 0) { - data[seq_id] = i; - } diff --git a/server/sched.go b/server/sched.go index 9d8c4144..58071bf0 100644 --- a/server/sched.go +++ b/server/sched.go @@ -193,6 +193,11 @@ func (s *Scheduler) processPending(ctx context.Context) { break } + // Embedding models should always be loaded with parallel=1 + if pending.model.CheckCapabilities(CapabilityCompletion) != nil { + numParallel = 1 + } + // Evaluate if the model will fit in the available system memory, or if we should unload a model first if len(gpus) == 1 && gpus[0].Library == "cpu" { // simplifying assumption of defaultParallel when in CPU mode
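
For context on the slot-0 constraint above: a minimal client-side sketch of the
batched embedding path this patch stabilizes. It assumes the Go api package's
Embed endpoint, the same one integration/embed_test.go exercises, and is
illustrative rather than part of the patch.

    package main

    import (
    	"context"
    	"fmt"
    	"log"

    	"github.com/ollama/ollama/api"
    )

    func main() {
    	client, err := api.ClientFromEnvironment()
    	if err != nil {
    		log.Fatal(err)
    	}

    	// A batched request: each input is embedded as its own sequence.
    	// Before this fix, parallel slots could hand out seq IDs >= the
    	// batch token count and trip llama.cpp's pooling assert; embeddings
    	// are now pinned to slot 0 with parallel=1.
    	res, err := client.Embed(context.Background(), &api.EmbedRequest{
    		Model: "all-minilm",
    		Input: []string{"why is the sky blue?", "why is the grass green?"},
    	})
    	if err != nil {
    		log.Fatal(err)
    	}

    	fmt.Println(len(res.Embeddings), "embeddings,", res.PromptEvalCount, "prompt tokens")
    }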
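
The rope-scaling arithmetic added in PATCH 70 (llama3.1) is easier to
sanity-check in isolation. Below is a standalone sketch using the patch's
cmp.Or fallbacks (factor 8, low/high frequency factors 1 and 4, original
context length 8192), plus an assumed rope_theta of 500000 and head dimension
of 128 as in the 8B model; it prints the per-dimension frequency factors that
end up in rope_freqs.weight.

    package main

    import (
    	"fmt"
    	"math"
    )

    func main() {
    	const (
    		ropeTheta  = 500000.0 // llama3.1 rope_theta (assumed here)
    		dim        = 128      // head dim: hidden_size / num_attention_heads
    		factor     = 8.0      // cmp.Or fallbacks from the patch
    		factorLow  = 1.0
    		factorHigh = 4.0
    		original   = 8192.0 // original max position embeddings
    	)

    	lambdaLow := original / factorLow   // wavelengths above this are fully scaled
    	lambdaHigh := original / factorHigh // wavelengths below this are left alone

    	for i := 0; i < dim; i += 2 {
    		// wavelength of rotary dimension i
    		lambda := 2 * math.Pi * math.Pow(ropeTheta, float64(i)/float64(dim))

    		var f float64
    		switch {
    		case lambda < lambdaHigh:
    			f = 1.0 // high-frequency dims: unscaled
    		case lambda > lambdaLow:
    			f = factor // low-frequency dims: fully scaled
    		default:
    			// smooth interpolation between the two regimes
    			smooth := (original/lambda - factorLow) / (factorHigh - factorLow)
    			f = 1.0 / ((1-smooth)/factor + smooth)
    		}

    		fmt.Printf("i=%3d lambda=%14.2f factor=%.4f\n", i, lambda, f)
    	}
    }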