From df3802a65fa5876708fa03e0369248bd0ea4b2c4 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Thu, 1 Aug 2024 17:22:25 -0700 Subject: [PATCH 01/71] Adjust ARM CUDA repo paths Ubuntu distros fail to install CUDA drivers on ARM because aarch64 is not a valid architecture directory in NVIDIA's repo layout; the ARM server repos use sbsa instead. --- scripts/install.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/scripts/install.sh b/scripts/install.sh index aa8b3e5e..03af5a69 100644 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -209,15 +209,15 @@ install_cuda_driver_yum() { case $PACKAGE_MANAGER in yum) $SUDO $PACKAGE_MANAGER -y install yum-utils - if curl -I --silent --fail --location "https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-$1$2.repo" >/dev/null ; then - $SUDO $PACKAGE_MANAGER-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-$1$2.repo + if curl -I --silent --fail --location "https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m | sed -e 's/aarch64/sbsa/')/cuda-$1$2.repo" >/dev/null ; then + $SUDO $PACKAGE_MANAGER-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m | sed -e 's/aarch64/sbsa/')/cuda-$1$2.repo else error $CUDA_REPO_ERR_MSG fi ;; dnf) - if curl -I --silent --fail --location "https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-$1$2.repo" >/dev/null ; then - $SUDO $PACKAGE_MANAGER config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-$1$2.repo + if curl -I --silent --fail --location "https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m | sed -e 's/aarch64/sbsa/')/cuda-$1$2.repo" >/dev/null ; then + $SUDO $PACKAGE_MANAGER config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m | sed -e 's/aarch64/sbsa/')/cuda-$1$2.repo else error $CUDA_REPO_ERR_MSG fi @@ -245,8 +245,8 @@ install_cuda_driver_yum() { # ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#debian install_cuda_driver_apt() { status 'Installing NVIDIA repository...'
- if curl -I --silent --fail --location "https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-keyring_1.1-1_all.deb" >/dev/null ; then - curl -fsSL -o $TEMP_DIR/cuda-keyring.deb https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-keyring_1.1-1_all.deb + if curl -I --silent --fail --location "https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m | sed -e 's/aarch64/sbsa/')/cuda-keyring_1.1-1_all.deb" >/dev/null ; then + curl -fsSL -o $TEMP_DIR/cuda-keyring.deb https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m | sed -e 's/aarch64/sbsa/')/cuda-keyring_1.1-1_all.deb else error $CUDA_REPO_ERR_MSG fi From a091fadfdaa2e4d6a34cf8bbfe4012913367a35a Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Fri, 2 Aug 2024 15:55:34 -0700 Subject: [PATCH 02/71] use testing tempdirs --- server/routes_create_test.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/server/routes_create_test.go b/server/routes_create_test.go index 9fd7f8cd..4de07b25 100644 --- a/server/routes_create_test.go +++ b/server/routes_create_test.go @@ -2,6 +2,7 @@ package server import ( "bytes" + "cmp" "encoding/json" "fmt" "io" @@ -53,6 +54,8 @@ func (t *responseRecorder) CloseNotify() <-chan bool { func createRequest(t *testing.T, fn func(*gin.Context), body any) *httptest.ResponseRecorder { t.Helper() + // if OLLAMA_MODELS is not set, set it to the temp directory + t.Setenv("OLLAMA_MODELS", cmp.Or(os.Getenv("OLLAMA_MODELS"), t.TempDir())) w := NewRecorder() c, _ := gin.CreateTestContext(w) From ed6c8bfe57e4678090b89fc8f6c4e08ce1b01040 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Mon, 5 Aug 2024 00:02:47 -0700 Subject: [PATCH 03/71] removeall to remove non-empty temp dirs --- gpu/assets.go | 39 +++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/gpu/assets.go b/gpu/assets.go index a35b6630..1c33b55b 100644 --- a/gpu/assets.go +++ b/gpu/assets.go @@ -67,37 +67,44 @@ func PayloadsDir() (string, error) { // Best effort to clean up prior tmpdirs func cleanupTmpDirs() { - dirs, err := filepath.Glob(filepath.Join(os.TempDir(), "ollama*")) + matches, err := filepath.Glob(filepath.Join(os.TempDir(), "ollama*", "ollama.pid")) if err != nil { return } - for _, d := range dirs { - info, err := os.Stat(d) - if err != nil || !info.IsDir() { + + for _, match := range matches { + raw, err := os.ReadFile(match) + if errors.Is(err, os.ErrNotExist) { + slog.Debug("not a ollama runtime directory, skipping", "path", match) continue - } - raw, err := os.ReadFile(filepath.Join(d, "ollama.pid")) - if err != nil { - slog.Warn("failed to read ollama.pid", "path", d, "error", err) - // No pid, ignore this tmpdir + } else if err != nil { + slog.Warn("could not read ollama.pid, skipping", "path", match, "error", err) continue } pid, err := strconv.Atoi(string(raw)) if err != nil { - slog.Warn("failed to parse pid", "path", d, "error", err) + slog.Warn("invalid pid, skipping", "path", match, "error", err) continue } - proc, err := os.FindProcess(pid) - if err == nil && !errors.Is(proc.Signal(syscall.Signal(0)), os.ErrProcessDone) { - slog.Warn("found running ollama", "pid", pid, "path", d) - // Another running ollama, ignore this tmpdir + p, err := os.FindProcess(pid) + if err == nil && !errors.Is(p.Signal(syscall.Signal(0)), os.ErrProcessDone) { + slog.Warn("process still running, skipping", "pid", pid, "path", match) continue } - if err := os.Remove(d); err != nil { - slog.Warn("unable to cleanup stale 
tmpdir", "path", d, "error", err) + if err := os.Remove(match); err != nil { + slog.Warn("could not cleanup stale pidfile", "path", match, "error", err) + } + + runners := filepath.Join(filepath.Dir(match), "runners") + if err := os.RemoveAll(runners); err != nil { + slog.Warn("could not cleanup stale runners", "path", runners, "error", err) + } + + if err := os.Remove(filepath.Dir(match)); err != nil { + slog.Warn("could not cleanup stale tmpdir", "path", filepath.Dir(match), "error", err) } } } From 43f9d92008bf1aaa2e89ca50c85761540f70c21a Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Mon, 5 Aug 2024 00:34:09 -0700 Subject: [PATCH 04/71] close pid file --- gpu/assets.go | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/gpu/assets.go b/gpu/assets.go index 1c33b55b..6d62d0dc 100644 --- a/gpu/assets.go +++ b/gpu/assets.go @@ -49,13 +49,9 @@ func PayloadsDir() (string, error) { } // Track our pid so we can clean up orphaned tmpdirs - pidFilePath := filepath.Join(tmpDir, "ollama.pid") - pidFile, err := os.OpenFile(pidFilePath, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, os.ModePerm) - if err != nil { - return "", err - } - if _, err := pidFile.Write([]byte(strconv.Itoa(os.Getpid()))); err != nil { - return "", err + n := filepath.Join(tmpDir, "ollama.pid") + if err := os.WriteFile(n, []byte(strconv.Itoa(os.Getpid())), 0o644); err != nil { + return "", fmt.Errorf("failed to write pid file %s: %w", n, err) } // We create a distinct subdirectory for payloads within the tmpdir From 04210aa6ddf9ec5d5b6101f6e8a12b68d7aadfee Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Mon, 5 Aug 2024 09:28:07 -0700 Subject: [PATCH 05/71] Catch one more error log --- llm/status.go | 1 + 1 file changed, 1 insertion(+) diff --git a/llm/status.go b/llm/status.go index d9f36115..604fe9e0 100644 --- a/llm/status.go +++ b/llm/status.go @@ -26,6 +26,7 @@ var errorPrefixes = []string{ "cudaMalloc failed", "\"ERR\"", "error loading model", + "GGML_ASSERT", } func (w *StatusWriter) Write(b []byte) (int, error) { From f457d63400f9859acdfff1853c53af13429acea5 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Mon, 5 Aug 2024 12:56:20 -0700 Subject: [PATCH 06/71] Implement linux NUMA detection If the system has multiple numa nodes, enable numa support in llama.cpp If we detect numactl in the path, use that, else use the basic "distribute" mode. 
--- api/types.go | 2 -- gpu/cpu_common.go | 21 +++++++++++++++++++++ llm/server.go | 10 ++++++++-- 3 files changed, 29 insertions(+), 4 deletions(-) diff --git a/api/types.go b/api/types.go index c2529652..291522a3 100644 --- a/api/types.go +++ b/api/types.go @@ -231,7 +231,6 @@ type Options struct { // Runner options which must be set when the model is loaded into memory type Runner struct { - UseNUMA bool `json:"numa,omitempty"` NumCtx int `json:"num_ctx,omitempty"` NumBatch int `json:"num_batch,omitempty"` NumGPU int `json:"num_gpu,omitempty"` @@ -615,7 +614,6 @@ func DefaultOptions() Options { F16KV: true, UseMLock: false, UseMMap: nil, - UseNUMA: false, }, } } diff --git a/gpu/cpu_common.go b/gpu/cpu_common.go index 63e88f25..34edcdc5 100644 --- a/gpu/cpu_common.go +++ b/gpu/cpu_common.go @@ -1,6 +1,11 @@ package gpu import ( + "os" + "path/filepath" + "runtime" + "strings" + "golang.org/x/sys/cpu" ) @@ -14,3 +19,19 @@ func GetCPUCapability() CPUCapability { // else LCD return CPUCapabilityNone } + +func IsNUMA() bool { + if runtime.GOOS != "linux" { + // numa support in llama.cpp is linux only + return false + } + ids := map[string]interface{}{} + packageIds, _ := filepath.Glob("/sys/devices/system/cpu/cpu*/topology/physical_package_id") + for _, packageId := range packageIds { + id, err := os.ReadFile(packageId) + if err == nil { + ids[strings.TrimSpace(string(id))] = struct{}{} + } + } + return len(ids) > 1 +} diff --git a/llm/server.go b/llm/server.go index 7abc3bd7..152b7582 100644 --- a/llm/server.go +++ b/llm/server.go @@ -256,8 +256,14 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr params = append(params, "--mlock") } - if opts.UseNUMA { - params = append(params, "--numa") + if gpu.IsNUMA() { + numaMode := "distribute" + if runtime.GOOS == "linux" { + if _, err := exec.LookPath("numactl"); err == nil { + numaMode = "numactl" + } + } + params = append(params, "--numa", numaMode) } params = append(params, "--parallel", strconv.Itoa(numParallel)) From 7ed367419e8fee28c393f1f80edfb5686fddaed6 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Mon, 5 Aug 2024 16:34:54 -0700 Subject: [PATCH 07/71] fix concurrency test --- integration/concurrency_test.go | 19 +++++++++---------- integration/llm_test.go | 4 ++-- integration/max_queue_test.go | 2 +- integration/utils_test.go | 10 +++++----- 4 files changed, 17 insertions(+), 18 deletions(-) diff --git a/integration/concurrency_test.go b/integration/concurrency_test.go index 81d0b587..42e9d074 100644 --- a/integration/concurrency_test.go +++ b/integration/concurrency_test.go @@ -5,6 +5,7 @@ package integration import ( "context" "log/slog" + "os" "strconv" "sync" "testing" @@ -13,7 +14,6 @@ import ( "github.com/stretchr/testify/require" "github.com/ollama/ollama/api" - "github.com/ollama/ollama/envconfig" "github.com/ollama/ollama/format" ) @@ -41,8 +41,8 @@ func TestMultiModelConcurrency(t *testing.T) { }, } resp = [2][]string{ - []string{"sunlight"}, - []string{"england", "english", "massachusetts", "pilgrims", "british"}, + {"sunlight"}, + {"england", "english", "massachusetts", "pilgrims", "british"}, } ) var wg sync.WaitGroup @@ -71,12 +71,11 @@ func TestIntegrationConcurrentPredictOrcaMini(t *testing.T) { reqLimit := len(req) iterLimit := 5 - vram := os.Getenv("OLLAMA_MAX_VRAM") // TODO - discover actual VRAM - if vram != "" { - max, err := strconv.ParseUint(vram, 10, 64) + if s := os.Getenv("OLLAMA_MAX_VRAM"); s != "" { + maxVram, err := strconv.ParseUint(s, 10, 64) require.NoError(t, err) // 
Don't hammer on small VRAM cards... - if max < 4*1024*1024*1024 { + if maxVram < 4*format.GibiByte { reqLimit = min(reqLimit, 2) iterLimit = 2 } @@ -233,12 +232,12 @@ func TestMultiModelStress(t *testing.T) { consumed := uint64(256 * format.MebiByte) // Assume some baseline usage for i := 0; i < len(req); i++ { // Always get at least 2 models, but dont' overshoot VRAM too much or we'll take too long - if i > 1 && consumed > vram { - slog.Info("achieved target vram exhaustion", "count", i, "vram", format.HumanBytes2(vram), "models", format.HumanBytes2(consumed)) + if i > 1 && consumed > maxVram { + slog.Info("achieved target vram exhaustion", "count", i, "vram", format.HumanBytes2(maxVram), "models", format.HumanBytes2(consumed)) break } consumed += chosenModels[i].size - slog.Info("target vram", "count", i, "vram", format.HumanBytes2(vram), "models", format.HumanBytes2(consumed)) + slog.Info("target vram", "count", i, "vram", format.HumanBytes2(maxVram), "models", format.HumanBytes2(consumed)) wg.Add(1) go func(i int) { diff --git a/integration/llm_test.go b/integration/llm_test.go index 4952b072..398e0a03 100644 --- a/integration/llm_test.go +++ b/integration/llm_test.go @@ -35,8 +35,8 @@ var ( }, } resp = [2][]string{ - []string{"sunlight"}, - []string{"england", "english", "massachusetts", "pilgrims"}, + {"sunlight"}, + {"england", "english", "massachusetts", "pilgrims"}, } ) diff --git a/integration/max_queue_test.go b/integration/max_queue_test.go index b06197e1..ec9e085a 100644 --- a/integration/max_queue_test.go +++ b/integration/max_queue_test.go @@ -29,7 +29,7 @@ func TestMaxQueue(t *testing.T) { // Also note that by default Darwin can't sustain > ~128 connections without adjusting limits threadCount := 32 if maxQueue := envconfig.MaxQueue(); maxQueue != 0 { - threadCount = maxQueue + threadCount = int(maxQueue) } else { t.Setenv("OLLAMA_MAX_QUEUE", strconv.Itoa(threadCount)) } diff --git a/integration/utils_test.go b/integration/utils_test.go index c2b27ee9..a6010995 100644 --- a/integration/utils_test.go +++ b/integration/utils_test.go @@ -334,10 +334,10 @@ func GenerateRequests() ([]api.GenerateRequest, [][]string) { }, }, [][]string{ - []string{"sunlight"}, - []string{"soil", "organic", "earth", "black", "tan"}, - []string{"england", "english", "massachusetts", "pilgrims", "british"}, - []string{"fourth", "july", "declaration", "independence"}, - []string{"nitrogen", "oxygen", "carbon", "dioxide"}, + {"sunlight"}, + {"soil", "organic", "earth", "black", "tan"}, + {"england", "english", "massachusetts", "pilgrims", "british"}, + {"fourth", "july", "declaration", "independence"}, + {"nitrogen", "oxygen", "carbon", "dioxide"}, } } From 86b907f82ad1cc5eb16e919d6cb5830765d73be4 Mon Sep 17 00:00:00 2001 From: royjhan <65097070+royjhan@users.noreply.github.com> Date: Mon, 5 Aug 2024 19:55:34 -0400 Subject: [PATCH 08/71] sort batch results (#6189) --- llm/ext_server/server.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/llm/ext_server/server.cpp b/llm/ext_server/server.cpp index d72bb1b1..071fe1e7 100644 --- a/llm/ext_server/server.cpp +++ b/llm/ext_server/server.cpp @@ -44,6 +44,7 @@ #include #endif +#include #include #include #include @@ -1220,6 +1221,7 @@ struct llama_server_context res.result_json = json { + {"id", res.id}, {"embedding", std::vector(embd, embd + n_embd)}, {"timings", slot.get_formated_timings()}, }; @@ -3203,6 +3205,10 @@ int main(int argc, char **argv) { } responses = result.result_json.value("results", std::vector{result.result_json}); + 
std::sort(responses.begin(), responses.end(), [](const json& a, const json& b) { + return a["id"] < b["id"]; + }); + json embeddings = json::array(); int prompt_n = 0; From fc85f50a2be9ba8776547de9db02c5373719eb13 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Tue, 6 Aug 2024 10:46:31 -0700 Subject: [PATCH 09/71] Ensure sparse files on Windows during download The file.Truncate call on Windows will write the whole file unless you set the sparse flag, leading to heavy I/O at the beginning of the download. This should improve our I/O behavior on Windows and put less stress on the user's disk. --- server/download.go | 3 +++ server/sparse_common.go | 9 +++++++++ server/sparse_windows.go | 16 ++++++++++++++++ 3 files changed, 28 insertions(+) create mode 100644 server/sparse_common.go create mode 100644 server/sparse_windows.go diff --git a/server/download.go b/server/download.go index a903d96f..38d24a6b 100644 --- a/server/download.go +++ b/server/download.go @@ -216,6 +216,9 @@ func (b *blobDownload) run(ctx context.Context, requestURL *url.URL, opts *regis return err } defer file.Close() + if err := setSparse(file); err != nil { + return err + } _ = file.Truncate(b.Total) diff --git a/server/sparse_common.go b/server/sparse_common.go new file mode 100644 index 00000000..f25627fc --- /dev/null +++ b/server/sparse_common.go @@ -0,0 +1,9 @@ +//go:build !windows + +package server + +import "os" + +func setSparse(file *os.File) error { + return nil +} diff --git a/server/sparse_windows.go b/server/sparse_windows.go new file mode 100644 index 00000000..cdad379e --- /dev/null +++ b/server/sparse_windows.go @@ -0,0 +1,16 @@ +package server + +import ( + "os" + + "golang.org/x/sys/windows" +) + +func setSparse(file *os.File) error { + return windows.DeviceIoControl( + windows.Handle(file.Fd()), windows.FSCTL_SET_SPARSE, + nil, 0, + nil, 0, + nil, nil, + ) +} From d4a7216c82bb406e644c739281ade3f7f2e283e5 Mon Sep 17 00:00:00 2001 From: Chua Chee Seng Date: Wed, 7 Aug 2024 02:37:16 +0800 Subject: [PATCH 10/71] Fix invalid option warnings not displaying the offending option name
(#6202) --- api/types.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/types.go b/api/types.go index 291522a3..2f5a9424 100644 --- a/api/types.go +++ b/api/types.go @@ -504,7 +504,7 @@ func (opts *Options) FromMap(m map[string]interface{}) error { for key, val := range m { opt, ok := jsonOpts[key] if !ok { - slog.Warn("invalid option provided", "option", opt.Name) + slog.Warn("invalid option provided", "option", key) continue } From e04c7012c235d8972afe5538ff27802c77217b83 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Tue, 6 Aug 2024 15:11:45 -0400 Subject: [PATCH 11/71] update llama.cpp submodule to `1e6f6554` (#6208) --- llm/ext_server/server.cpp | 14 +++++++++++--- llm/llama.cpp | 2 +- llm/patches/09-lora.diff | 34 +++++++++++++--------------------- llm/patches/10-params.diff | 20 -------------------- 4 files changed, 25 insertions(+), 45 deletions(-) delete mode 100644 llm/patches/10-params.diff diff --git a/llm/ext_server/server.cpp b/llm/ext_server/server.cpp index 071fe1e7..c65901c7 100644 --- a/llm/ext_server/server.cpp +++ b/llm/ext_server/server.cpp @@ -403,7 +403,9 @@ struct llama_server_context } } - std::tie(model, ctx) = llama_init_from_gpt_params(params); + auto init_result = llama_init_from_gpt_params(params); + model = init_result.model; + ctx = init_result.context; if (model == nullptr) { LOG_ERROR("unable to load model", {{"model", params.model}}); @@ -2422,7 +2424,10 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g invalid_param = true; break; } - params.lora_adapter.emplace_back(argv[i], 1.0f); + params.lora_adapters.push_back({ + std::string(argv[i]), + 1.0, + }); params.use_mmap = false; } else if (arg == "--lora-scaled") @@ -2438,7 +2443,10 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g invalid_param = true; break; } - params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i])); + params.lora_adapters.push_back({ + lora_adapter, + std::stof(argv[i]) + }); params.use_mmap = false; } else if (arg == "-v" || arg == "--verbose") diff --git a/llm/llama.cpp b/llm/llama.cpp index 6eeaeba1..1e6f6554 160000 --- a/llm/llama.cpp +++ b/llm/llama.cpp @@ -1 +1 @@ -Subproject commit 6eeaeba126ff701f3e8f79f246805b7023709972 +Subproject commit 1e6f6554aa11fa10160a5fda689e736c3c34169f diff --git a/llm/patches/09-lora.diff b/llm/patches/09-lora.diff index 10c66d1d..21958476 100644 --- a/llm/patches/09-lora.diff +++ b/llm/patches/09-lora.diff @@ -1,40 +1,32 @@ diff --git a/common/common.cpp b/common/common.cpp -index dbb724fb..c26fe6ee 100644 +index 2e8374d5..70d0afde 100644 --- a/common/common.cpp +++ b/common/common.cpp -@@ -2087,14 +2087,27 @@ std::tuple llama_init_from_gpt_par - for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) { - const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]); - float lora_scale = std::get<1>(params.lora_adapter[i]); -+ -+ // try to load as gguf - auto adapter = llama_lora_adapter_init(model, lora_adapter.c_str()); - if (adapter == nullptr) { -- fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__); +@@ -2110,9 +2110,21 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) { + loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str()); + if (loaded_la.adapter == nullptr) { + fprintf(stderr, "%s: error: failed to apply lora adapter '%s'\n", __func__, la.path.c_str()); - llama_free(lctx); - llama_free_model(model); -- return std::make_tuple(nullptr, nullptr); -+ 
fprintf(stderr, "%s: error: failed to apply lora adapter, trying ggla\n", __func__); +- return iparams; + + // if that fails, try loading as ggla for compatibility + int err = llama_model_apply_lora_from_file(model, -+ lora_adapter.c_str(), -+ lora_scale, ++ la.path.c_str(), ++ la.scale, + nullptr, + params.n_threads); + if (err != 0) { + fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__); + llama_free(lctx); + llama_free_model(model); -+ return std::make_tuple(nullptr, nullptr); ++ return iparams; ++ } else { ++ break; + } -+ } else { -+ llama_lora_adapter_set(lctx, adapter, lora_scale); } -- llama_lora_adapter_set(lctx, adapter, lora_scale); + iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters } - - if (params.ignore_eos) { diff --git a/include/llama.h b/include/llama.h index 93fd77ca..b0fb37a6 100644 --- a/include/llama.h @@ -355,4 +347,4 @@ index 80a0dd0f..9d7b0e17 100644 + return 1; + } +} -\ No newline at end of file +\ No newline at end of file \ No newline at end of file diff --git a/llm/patches/10-params.diff b/llm/patches/10-params.diff deleted file mode 100644 index 56699b8e..00000000 --- a/llm/patches/10-params.diff +++ /dev/null @@ -1,20 +0,0 @@ -diff --git a/src/llama.cpp b/src/llama.cpp -index a207451f..fba6b175 100644 ---- a/src/llama.cpp -+++ b/src/llama.cpp -@@ -4969,6 +4969,7 @@ static void llm_load_hparams( - hparams.attn_soft_cap = true; - - switch (hparams.n_layer) { -+ case 26: model.type = e_model::MODEL_2B; break; - case 42: model.type = e_model::MODEL_9B; break; - case 46: model.type = e_model::MODEL_27B; break; - default: model.type = e_model::MODEL_UNKNOWN; -@@ -11736,6 +11737,7 @@ struct llm_build_context { - - // ref: https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e - switch (model.type) { -+ case e_model::MODEL_2B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); break; - case e_model::MODEL_9B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); break; - case e_model::MODEL_27B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd / n_head))); break; - default: GGML_ABORT("fatal error"); From de4fc297732cb60ff79a6c8010a7c79971c21b4a Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Tue, 6 Aug 2024 23:20:49 -0400 Subject: [PATCH 12/71] llm: reserve required number of slots for embeddings (#6219) --- llm/server.go | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/llm/server.go b/llm/server.go index 152b7582..41736068 100644 --- a/llm/server.go +++ b/llm/server.go @@ -44,11 +44,12 @@ type LlamaServer interface { // llmServer is an instance of the llama.cpp server type llmServer struct { - port int - cmd *exec.Cmd - done chan error // Channel to signal when the process exits - status *StatusWriter - options api.Options + port int + cmd *exec.Cmd + done chan error // Channel to signal when the process exits + status *StatusWriter + options api.Options + numParallel int estimate MemoryEstimate totalLayers uint64 @@ -343,6 +344,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr status: NewStatusWriter(os.Stderr), options: opts, estimate: estimate, + numParallel: numParallel, sem: semaphore.NewWeighted(int64(numParallel)), totalLayers: ggml.KV().BlockCount() + 1, gpus: gpus, @@ -890,11 +892,14 @@ type EmbedResponse struct { } func (s *llmServer) Embed(ctx context.Context, input []string) (*EmbedResponse, error) { - if err := s.sem.Acquire(ctx, 1); err != nil { + // each 
input will use a slot, so we need to acquire the semaphore for + the number of inputs up to numParallel + slots := int64(min(len(input), s.numParallel)) + if err := s.sem.Acquire(ctx, slots); err != nil { slog.Error("Failed to acquire semaphore", "error", err) return nil, err } - defer s.sem.Release(1) + defer s.sem.Release(slots) // Make sure the server is ready status, err := s.getServerStatusRetry(ctx) From 685a53534b80a14efdfdb09ca00af984782ba6ee Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Thu, 1 Aug 2024 15:05:16 -0700 Subject: [PATCH 13/71] manifest: Don't prune layers if we can't open a manifest file If there is an error when opening a manifest file (corrupted, permission denied, etc.), then the referenced layers will not be included in the list of active layers. This causes them to be deleted when pruning happens at startup or a model is pulled. In such a situation, we should prefer to preserve data in the hopes that it can be recovered rather than being aggressive about deletion. --- server/images.go | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/server/images.go b/server/images.go index 81357f3c..05875a88 100644 --- a/server/images.go +++ b/server/images.go @@ -714,8 +714,7 @@ func deleteUnusedLayers(skipModelPath *ModelPath, deleteMap map[string]struct{}) // save (i.e. delete from the deleteMap) any files used in other manifests manifest, _, err := GetManifest(fmp) if err != nil { - //nolint:nilerr - return nil + return err } for _, layer := range manifest.Layers { @@ -782,7 +781,8 @@ func PruneLayers() error { err = deleteUnusedLayers(nil, deleteMap) if err != nil { - return err + slog.Info(fmt.Sprintf("couldn't remove unused layers: %v", err)) + return nil } slog.Info(fmt.Sprintf("total unused blobs removed: %d", len(deleteMap))) @@ -971,7 +971,8 @@ func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn fu fn(api.ProgressResponse{Status: "removing any unused layers"}) err = deleteUnusedLayers(nil, deleteMap) if err != nil { - return err + slog.Info(fmt.Sprintf("couldn't remove unused layers: %v", err)) + fn(api.ProgressResponse{Status: fmt.Sprintf("couldn't remove unused layers: %v", err)}) } } From ce67706037a2583157fcac4cbf6253fe0f1e5139 Mon Sep 17 00:00:00 2001 From: Nicholas Schwab Date: Wed, 7 Aug 2024 18:15:17 +0200 Subject: [PATCH 14/71] Set *.png and *.ico to be treated as binary files. The change b732beba6 makes all files text files and sets lf as eol. This will automatically change all files to have lf line endings if they are touched by git (e.g. via git status). This change cannot be stashed and makes it hard to work with the repo (rebase and checkout don't really work). See also #6183. Here, we set the offending files (*.png and *.ico, though there may be more in the future) to be treated as binary files and not be changed by git. --- .gitattributes | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitattributes b/.gitattributes index f7192096..648c78ca 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,2 +1,4 @@ llm/ext_server/* linguist-vendored * text eol=lf +*.png binary +*.ico binary From 1829fb61bd7a4186881714618f09b2877d0bc9a3 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Mon, 5 Aug 2024 17:13:52 -0700 Subject: [PATCH 15/71] manifest: Fix crash on startup when trying to clean up unused files (#5840) Currently if the config field is missing in the manifest file (or corrupted), Ollama will crash when it tries to read it. This can happen at startup or when pulling new models.
This data is mostly just used for showing model information so we can be tolerant of it not being present - it is not required to run the models. Besides avoiding crashing, this also gives us the ability to restructure the config in the future by pulling it into the main manifest file. --- server/images.go | 40 ++++++++++++++++++++++++---------------- server/layer.go | 15 ++++++++++++++- server/manifest.go | 18 ++++++++++-------- server/routes.go | 23 +++++++++++++---------- 4 files changed, 61 insertions(+), 35 deletions(-) diff --git a/server/images.go b/server/images.go index 05875a88..7ed35995 100644 --- a/server/images.go +++ b/server/images.go @@ -250,19 +250,21 @@ func GetModel(name string) (*Model, error) { Template: template.DefaultTemplate, } - filename, err := GetBlobsPath(manifest.Config.Digest) - if err != nil { - return nil, err - } + if manifest.Config.Digest != "" { + filename, err := GetBlobsPath(manifest.Config.Digest) + if err != nil { + return nil, err + } - configFile, err := os.Open(filename) - if err != nil { - return nil, err - } - defer configFile.Close() + configFile, err := os.Open(filename) + if err != nil { + return nil, err + } + defer configFile.Close() - if err := json.NewDecoder(configFile).Decode(&model.Config); err != nil { - return nil, err + if err := json.NewDecoder(configFile).Decode(&model.Config); err != nil { + return nil, err + } } for _, layer := range manifest.Layers { @@ -781,7 +783,7 @@ func PruneLayers() error { err = deleteUnusedLayers(nil, deleteMap) if err != nil { - slog.Info(fmt.Sprintf("couldn't remove unused layers: %v", err)) + slog.Error(fmt.Sprintf("couldn't remove unused layers: %v", err)) return nil } @@ -839,7 +841,9 @@ func PushModel(ctx context.Context, name string, regOpts *registryOptions, fn fu var layers []*Layer layers = append(layers, manifest.Layers...) - layers = append(layers, manifest.Config) + if manifest.Config.Digest != "" { + layers = append(layers, &manifest.Config) + } for _, layer := range layers { if err := uploadBlob(ctx, mp, layer, regOpts, fn); err != nil { @@ -890,7 +894,9 @@ func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn fu for _, l := range manifest.Layers { deleteMap[l.Digest] = struct{}{} } - deleteMap[manifest.Config.Digest] = struct{}{} + if manifest.Config.Digest != "" { + deleteMap[manifest.Config.Digest] = struct{}{} + } } } @@ -907,7 +913,9 @@ func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn fu var layers []*Layer layers = append(layers, manifest.Layers...) 
- layers = append(layers, manifest.Config) + if manifest.Config.Digest != "" { + layers = append(layers, &manifest.Config) + } skipVerify := make(map[string]bool) for _, layer := range layers { @@ -971,7 +979,7 @@ func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn fu fn(api.ProgressResponse{Status: "removing any unused layers"}) err = deleteUnusedLayers(nil, deleteMap) if err != nil { - slog.Info(fmt.Sprintf("couldn't remove unused layers: %v", err)) + slog.Error(fmt.Sprintf("couldn't remove unused layers: %v", err)) fn(api.ProgressResponse{Status: fmt.Sprintf("couldn't remove unused layers: %v", err)}) } } diff --git a/server/layer.go b/server/layer.go index cc6709d2..a2b66782 100644 --- a/server/layer.go +++ b/server/layer.go @@ -2,6 +2,7 @@ package server import ( "crypto/sha256" + "errors" "fmt" "io" "os" @@ -61,6 +62,10 @@ func NewLayer(r io.Reader, mediatype string) (*Layer, error) { } func NewLayerFromLayer(digest, mediatype, from string) (*Layer, error) { + if digest == "" { + return nil, errors.New("creating new layer from layer with empty digest") + } + blob, err := GetBlobsPath(digest) if err != nil { return nil, err @@ -81,6 +86,10 @@ func NewLayerFromLayer(digest, mediatype, from string) (*Layer, error) { } func (l *Layer) Open() (io.ReadSeekCloser, error) { + if l.Digest == "" { + return nil, errors.New("opening layer with empty digest") + } + blob, err := GetBlobsPath(l.Digest) if err != nil { return nil, err @@ -90,13 +99,17 @@ func (l *Layer) Open() (io.ReadSeekCloser, error) { } func (l *Layer) Remove() error { + if l.Digest == "" { + return nil + } + ms, err := Manifests() if err != nil { return err } for _, m := range ms { - for _, layer := range append(m.Layers, m.Config) { + for _, layer := range append(m.Layers, &m.Config) { if layer.Digest == l.Digest { // something is using this layer return nil diff --git a/server/manifest.go b/server/manifest.go index b8df11ef..b966ddbe 100644 --- a/server/manifest.go +++ b/server/manifest.go @@ -16,7 +16,7 @@ import ( type Manifest struct { SchemaVersion int `json:"schemaVersion"` MediaType string `json:"mediaType"` - Config *Layer `json:"config"` + Config Layer `json:"config"` Layers []*Layer `json:"layers"` filepath string @@ -25,7 +25,7 @@ type Manifest struct { } func (m *Manifest) Size() (size int64) { - for _, layer := range append(m.Layers, m.Config) { + for _, layer := range append(m.Layers, &m.Config) { size += layer.Size } @@ -46,11 +46,13 @@ func (m *Manifest) Remove() error { } func (m *Manifest) RemoveLayers() error { - for _, layer := range append(m.Layers, m.Config) { - if err := layer.Remove(); errors.Is(err, os.ErrNotExist) { - slog.Debug("layer does not exist", "digest", layer.Digest) - } else if err != nil { - return err + for _, layer := range append(m.Layers, &m.Config) { + if layer.Digest != "" { + if err := layer.Remove(); errors.Is(err, os.ErrNotExist) { + slog.Debug("layer does not exist", "digest", layer.Digest) + } else if err != nil { + return err + } } } @@ -113,7 +115,7 @@ func WriteManifest(name model.Name, config *Layer, layers []*Layer) error { m := Manifest{ SchemaVersion: 2, MediaType: "application/vnd.docker.distribution.manifest.v2+json", - Config: config, + Config: *config, Layers: layers, } diff --git a/server/routes.go b/server/routes.go index b9c66b65..e55eaa9d 100644 --- a/server/routes.go +++ b/server/routes.go @@ -824,17 +824,20 @@ func (s *Server) ListModelsHandler(c *gin.Context) { models := []api.ListModelResponse{} for n, m := range ms { - f, err := 
m.Config.Open() - if err != nil { - slog.Warn("bad manifest filepath", "name", n, "error", err) - continue - } - defer f.Close() - var cf ConfigV2 - if err := json.NewDecoder(f).Decode(&cf); err != nil { - slog.Warn("bad manifest config", "name", n, "error", err) - continue + + if m.Config.Digest != "" { + f, err := m.Config.Open() + if err != nil { + slog.Warn("bad manifest filepath", "name", n, "error", err) + continue + } + defer f.Close() + + if err := json.NewDecoder(f).Decode(&cf); err != nil { + slog.Warn("bad manifest config", "name", n, "error", err) + continue + } } // tag should never be masked From ad0c19dde403ba67aa27247775e33c33c30ee235 Mon Sep 17 00:00:00 2001 From: Kyle Kelley Date: Wed, 7 Aug 2024 14:20:50 -0700 Subject: [PATCH 16/71] Use llama3.1 in tools example (#5985) * Use llama3.1 in tools example * Update api.md --- docs/api.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/api.md b/docs/api.md index c0202ef3..5cbba523 100644 --- a/docs/api.md +++ b/docs/api.md @@ -669,7 +669,7 @@ curl http://localhost:11434/api/chat -d '{ ``` curl http://localhost:11434/api/chat -d '{ - "model": "mistral", + "model": "llama3.1", "messages": [ { "role": "user", @@ -708,7 +708,7 @@ curl http://localhost:11434/api/chat -d '{ ```json { - "model": "mistral:7b-instruct-v0.3-q4_K_M", + "model": "llama3.1", "created_at": "2024-07-22T20:33:28.123648Z", "message": { "role": "assistant", From 5b3a21b578da89b1682a98ce123a6b3c91697e9b Mon Sep 17 00:00:00 2001 From: royjhan <65097070+royjhan@users.noreply.github.com> Date: Wed, 7 Aug 2024 17:43:44 -0400 Subject: [PATCH 17/71] add metrics to docs (#6079) --- docs/api.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/api.md b/docs/api.md index 5cbba523..aed2b69f 100644 --- a/docs/api.md +++ b/docs/api.md @@ -1175,7 +1175,10 @@ curl http://localhost:11434/api/embed -d '{ "embeddings": [[ 0.010071029, -0.0017594862, 0.05007221, 0.04692972, 0.054916814, 0.008599704, 0.105441414, -0.025878139, 0.12958129, 0.031952348 - ]] + ]], + "total_duration": 14143917, + "load_duration": 1019500, + "prompt_eval_count": 8 } ``` From 97ec8cfd4ef13190f3939fbb24b6f146d570ed12 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Wed, 7 Aug 2024 11:44:25 -0700 Subject: [PATCH 18/71] image: Clarify argument to WriteManifest is config When creating a model the config layer is appended to the list of layers and then the last layer is used as the config when writing the manifest. This change directly uses the config layer to write the manifest. There is no behavior change but it is less error prone. 
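The hazard this removes is worth spelling out: when the config layer is only reachable as the last element of a combined slice, any later filtering or reordering silently changes which layer is written as the manifest config. A toy sketch of the two styles (simplified types and names, not the server's real code):

```go
package main

import "fmt"

type Layer struct {
	MediaType string
	Digest    string
}

func writeManifest(config Layer, layers []Layer) {
	fmt.Printf("config=%s layers=%d\n", config.Digest, len(layers))
}

func main() {
	layers := []Layer{{MediaType: "application/vnd.ollama.image.model", Digest: "sha256:model"}}
	config := Layer{MediaType: "application/vnd.docker.container.image.v1+json", Digest: "sha256:config"}

	// Fragile: the config is implicit in its position, so anything that
	// re-sorts or filters `all` changes what gets written as the config.
	all := append(layers, config)
	writeManifest(all[len(all)-1], layers)

	// Explicit: the config keeps its own name; iterating the combined slice
	// (e.g. to report per-layer progress) can never be confused with it.
	for _, layer := range append(layers, config) {
		_ = layer
	}
	writeManifest(config, layers)
}
```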
--- server/images.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/server/images.go b/server/images.go index 7ed35995..4202a413 100644 --- a/server/images.go +++ b/server/images.go @@ -625,12 +625,12 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio return err } - layer, err := NewLayer(&b, "application/vnd.docker.container.image.v1+json") + configLayer, err := NewLayer(&b, "application/vnd.docker.container.image.v1+json") if err != nil { return err } - for _, layer := range append(layers, layer) { + for _, layer := range append(layers, configLayer) { if layer.status != "" { fn(api.ProgressResponse{Status: layer.status}) } @@ -639,7 +639,7 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio old, _ := ParseNamedManifest(name) fn(api.ProgressResponse{Status: "writing manifest"}) - if err := WriteManifest(name, layer, layers); err != nil { + if err := WriteManifest(name, configLayer, layers); err != nil { return err } From 7edaf6e7e8d79a9c88419988ae98afaf3fc32f15 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Wed, 7 Aug 2024 14:22:17 -0700 Subject: [PATCH 19/71] manifest: Store layers inside manifests consistently as values. Commit 1829fb61 ("manifest: Fix crash on startup when trying to clean up unused files (#5840)") changed the config layer stored in manifests from a pointer to a value. This was done in order to avoid potential nil pointer dereferences after it is deserialized from JSON in the event that the field is missing. This changes the Layers slice to also be stored by value. This enables consistency in handling across the two objects. --- server/images.go | 14 +++++++------- server/layer.go | 28 ++++++++++++++-------------- server/manifest.go | 16 ++++++++-------- server/model.go | 2 +- server/routes_delete_test.go | 2 +- server/upload.go | 4 ++-- 6 files changed, 33 insertions(+), 33 deletions(-) diff --git a/server/images.go b/server/images.go index 4202a413..0e753f56 100644 --- a/server/images.go +++ b/server/images.go @@ -373,7 +373,7 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio var messages []*api.Message parameters := make(map[string]any) - var layers []*Layer + var layers []Layer for _, c := range modelfile.Commands { mediatype := fmt.Sprintf("application/vnd.ollama.image.%s", c.Name) @@ -499,7 +499,7 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio if c.Name != "license" { // replace - layers = slices.DeleteFunc(layers, func(layer *Layer) bool { + layers = slices.DeleteFunc(layers, func(layer Layer) bool { if layer.MediaType != mediatype { return false } @@ -545,7 +545,7 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio } var err2 error - layers = slices.DeleteFunc(layers, func(layer *Layer) bool { + layers = slices.DeleteFunc(layers, func(layer Layer) bool { switch layer.MediaType { case "application/vnd.ollama.image.message": // if there are new messages, remove the inherited ones @@ -839,10 +839,10 @@ func PushModel(ctx context.Context, name string, regOpts *registryOptions, fn fu return err } - var layers []*Layer + var layers []Layer layers = append(layers, manifest.Layers...) 
if manifest.Config.Digest != "" { - layers = append(layers, &manifest.Config) + layers = append(layers, manifest.Config) } for _, layer := range layers { @@ -911,10 +911,10 @@ func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn fu return fmt.Errorf("pull model manifest: %s", err) } - var layers []*Layer + var layers []Layer layers = append(layers, manifest.Layers...) if manifest.Config.Digest != "" { - layers = append(layers, &manifest.Config) + layers = append(layers, manifest.Config) } skipVerify := make(map[string]bool) diff --git a/server/layer.go b/server/layer.go index a2b66782..c666bd10 100644 --- a/server/layer.go +++ b/server/layer.go @@ -16,15 +16,15 @@ type Layer struct { status string } -func NewLayer(r io.Reader, mediatype string) (*Layer, error) { +func NewLayer(r io.Reader, mediatype string) (Layer, error) { blobs, err := GetBlobsPath("") if err != nil { - return nil, err + return Layer{}, err } temp, err := os.CreateTemp(blobs, "sha256-") if err != nil { - return nil, err + return Layer{}, err } defer temp.Close() defer os.Remove(temp.Name()) @@ -32,28 +32,28 @@ func NewLayer(r io.Reader, mediatype string) (*Layer, error) { sha256sum := sha256.New() n, err := io.Copy(io.MultiWriter(temp, sha256sum), r) if err != nil { - return nil, err + return Layer{}, err } if err := temp.Close(); err != nil { - return nil, err + return Layer{}, err } digest := fmt.Sprintf("sha256:%x", sha256sum.Sum(nil)) blob, err := GetBlobsPath(digest) if err != nil { - return nil, err + return Layer{}, err } status := "using existing layer" if _, err := os.Stat(blob); err != nil { status = "creating new layer" if err := os.Rename(temp.Name(), blob); err != nil { - return nil, err + return Layer{}, err } } - return &Layer{ + return Layer{ MediaType: mediatype, Digest: digest, Size: n, @@ -61,22 +61,22 @@ func NewLayer(r io.Reader, mediatype string) (*Layer, error) { }, nil } -func NewLayerFromLayer(digest, mediatype, from string) (*Layer, error) { +func NewLayerFromLayer(digest, mediatype, from string) (Layer, error) { if digest == "" { - return nil, errors.New("creating new layer from layer with empty digest") + return Layer{}, errors.New("creating new layer from layer with empty digest") } blob, err := GetBlobsPath(digest) if err != nil { - return nil, err + return Layer{}, err } fi, err := os.Stat(blob) if err != nil { - return nil, err + return Layer{}, err } - return &Layer{ + return Layer{ MediaType: mediatype, Digest: digest, Size: fi.Size(), @@ -109,7 +109,7 @@ func (l *Layer) Remove() error { } for _, m := range ms { - for _, layer := range append(m.Layers, &m.Config) { + for _, layer := range append(m.Layers, m.Config) { if layer.Digest == l.Digest { // something is using this layer return nil diff --git a/server/manifest.go b/server/manifest.go index b966ddbe..6a5d7b88 100644 --- a/server/manifest.go +++ b/server/manifest.go @@ -14,10 +14,10 @@ import ( ) type Manifest struct { - SchemaVersion int `json:"schemaVersion"` - MediaType string `json:"mediaType"` - Config Layer `json:"config"` - Layers []*Layer `json:"layers"` + SchemaVersion int `json:"schemaVersion"` + MediaType string `json:"mediaType"` + Config Layer `json:"config"` + Layers []Layer `json:"layers"` filepath string fi os.FileInfo @@ -25,7 +25,7 @@ type Manifest struct { } func (m *Manifest) Size() (size int64) { - for _, layer := range append(m.Layers, &m.Config) { + for _, layer := range append(m.Layers, m.Config) { size += layer.Size } @@ -46,7 +46,7 @@ func (m *Manifest) Remove() error { } func (m 
*Manifest) RemoveLayers() error { - for _, layer := range append(m.Layers, &m.Config) { + for _, layer := range append(m.Layers, m.Config) { if layer.Digest != "" { if err := layer.Remove(); errors.Is(err, os.ErrNotExist) { slog.Debug("layer does not exist", "digest", layer.Digest) @@ -95,7 +95,7 @@ func ParseNamedManifest(n model.Name) (*Manifest, error) { return &m, nil } -func WriteManifest(name model.Name, config *Layer, layers []*Layer) error { +func WriteManifest(name model.Name, config Layer, layers []Layer) error { manifests, err := GetManifestPath() if err != nil { return err @@ -115,7 +115,7 @@ func WriteManifest(name model.Name, config *Layer, layers []*Layer) error { m := Manifest{ SchemaVersion: 2, MediaType: "application/vnd.docker.distribution.manifest.v2+json", - Config: *config, + Config: config, Layers: layers, } diff --git a/server/model.go b/server/model.go index f2946a0b..ad6e4e55 100644 --- a/server/model.go +++ b/server/model.go @@ -26,7 +26,7 @@ import ( var intermediateBlobs map[string]string = make(map[string]string) type layerGGML struct { - *Layer + Layer *llm.GGML } diff --git a/server/routes_delete_test.go b/server/routes_delete_test.go index 1c950d66..82fac9f5 100644 --- a/server/routes_delete_test.go +++ b/server/routes_delete_test.go @@ -98,7 +98,7 @@ func TestDeleteDuplicateLayers(t *testing.T) { } // create a manifest with duplicate layers - if err := WriteManifest(n, config, []*Layer{config}); err != nil { + if err := WriteManifest(n, config, []Layer{config}); err != nil { t.Fatal(err) } diff --git a/server/upload.go b/server/upload.go index b5a244ea..2f115436 100644 --- a/server/upload.go +++ b/server/upload.go @@ -26,7 +26,7 @@ import ( var blobUploadManager sync.Map type blobUpload struct { - *Layer + Layer Total int64 Completed atomic.Int64 @@ -362,7 +362,7 @@ func (p *progressWriter) Rollback() { p.written = 0 } -func uploadBlob(ctx context.Context, mp ModelPath, layer *Layer, opts *registryOptions, fn func(api.ProgressResponse)) error { +func uploadBlob(ctx context.Context, mp ModelPath, layer Layer, opts *registryOptions, fn func(api.ProgressResponse)) error { requestURL := mp.BaseURL() requestURL = requestURL.JoinPath("v2", mp.GetNamespaceRepository(), "blobs", layer.Digest) From 7b61eba47159748bcfc35227a13e31c899a84e49 Mon Sep 17 00:00:00 2001 From: Jitang Lei Date: Thu, 8 Aug 2024 20:28:01 +0800 Subject: [PATCH 20/71] server/download.go: Fix a typo in log Signed-off-by: Jitang Lei --- server/download.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/download.go b/server/download.go index 38d24a6b..cf31df5e 100644 --- a/server/download.go +++ b/server/download.go @@ -235,7 +235,7 @@ func (b *blobDownload) run(ctx context.Context, requestURL *url.URL, opts *regis newOpts.CheckRedirect = func(req *http.Request, via []*http.Request) error { if len(via) > 10 { - return errors.New("maxium redirects exceeded (10) for directURL") + return errors.New("maximum redirects exceeded (10) for directURL") } // if the hostname is the same, allow the redirect From 67472e0e89f516ccfbfad2d11414aadf484b7642 Mon Sep 17 00:00:00 2001 From: Nicholas42 Date: Fri, 9 Aug 2024 13:41:20 +0200 Subject: [PATCH 21/71] Also flag *.icns as binary --- .gitattributes | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitattributes b/.gitattributes index 648c78ca..baabd3c1 100644 --- a/.gitattributes +++ b/.gitattributes @@ -2,3 +2,4 @@ llm/ext_server/* linguist-vendored * text eol=lf *.png binary *.ico binary +*.icns binary From 
5bca2e60a7baefe582077469a1d14ff516b5d322 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Fri, 9 Aug 2024 11:31:38 -0700 Subject: [PATCH 22/71] Harden Intel bootstrap for nil pointers --- gpu/gpu.go | 61 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 32 insertions(+), 29 deletions(-) diff --git a/gpu/gpu.go b/gpu/gpu.go index 7ae8fbec..dc124a3e 100644 --- a/gpu/gpu.go +++ b/gpu/gpu.go @@ -305,38 +305,41 @@ func GetGPUInfo() GpuInfoList { // Intel if envconfig.IntelGPU() { oHandles = initOneAPIHandles() - // On windows we bundle the oneapi library one level above the runner dir - depPath = "" - if runtime.GOOS == "windows" && envconfig.RunnersDir() != "" { - depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir()), "oneapi") - } + if oHandles != nil && oHandles.oneapi != nil { - for d := range oHandles.oneapi.num_drivers { - if oHandles.oneapi == nil { - // shouldn't happen - slog.Warn("nil oneapi handle with driver count", "count", int(oHandles.oneapi.num_drivers)) - continue + // On windows we bundle the oneapi library one level above the runner dir + depPath = "" + if runtime.GOOS == "windows" && envconfig.RunnersDir() != "" { + depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir()), "oneapi") } - devCount := C.oneapi_get_device_count(*oHandles.oneapi, C.int(d)) - for i := range devCount { - gpuInfo := OneapiGPUInfo{ - GpuInfo: GpuInfo{ - Library: "oneapi", - }, - driverIndex: int(d), - gpuIndex: int(i), + + for d := range oHandles.oneapi.num_drivers { + if oHandles.oneapi == nil { + // shouldn't happen + slog.Warn("nil oneapi handle with driver count", "count", int(oHandles.oneapi.num_drivers)) + continue + } + devCount := C.oneapi_get_device_count(*oHandles.oneapi, C.int(d)) + for i := range devCount { + gpuInfo := OneapiGPUInfo{ + GpuInfo: GpuInfo{ + Library: "oneapi", + }, + driverIndex: int(d), + gpuIndex: int(i), + } + // TODO - split bootstrapping from updating free memory + C.oneapi_check_vram(*oHandles.oneapi, C.int(d), i, &memInfo) + // TODO - convert this to MinimumMemory based on testing... + var totalFreeMem float64 = float64(memInfo.free) * 0.95 // work-around: leave some reserve vram for mkl lib used in ggml-sycl backend. + memInfo.free = C.uint64_t(totalFreeMem) + gpuInfo.TotalMemory = uint64(memInfo.total) + gpuInfo.FreeMemory = uint64(memInfo.free) + gpuInfo.ID = C.GoString(&memInfo.gpu_id[0]) + gpuInfo.Name = C.GoString(&memInfo.gpu_name[0]) + gpuInfo.DependencyPath = depPath + oneapiGPUs = append(oneapiGPUs, gpuInfo) } - // TODO - split bootstrapping from updating free memory - C.oneapi_check_vram(*oHandles.oneapi, C.int(d), i, &memInfo) - // TODO - convert this to MinimumMemory based on testing... - var totalFreeMem float64 = float64(memInfo.free) * 0.95 // work-around: leave some reserve vram for mkl lib used in ggml-sycl backend. - memInfo.free = C.uint64_t(totalFreeMem) - gpuInfo.TotalMemory = uint64(memInfo.total) - gpuInfo.FreeMemory = uint64(memInfo.free) - gpuInfo.ID = C.GoString(&memInfo.gpu_id[0]) - gpuInfo.Name = C.GoString(&memInfo.gpu_name[0]) - gpuInfo.DependencyPath = depPath - oneapiGPUs = append(oneapiGPUs, gpuInfo) } } } From 2fa1db434581bcfcb6fec1482904656e4b5f8313 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Fri, 9 Aug 2024 11:57:48 -0700 Subject: [PATCH 23/71] Don't hard fail on sparse setup error It seems this can fail in some cases, so proceed with the download anyway.
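The shape of the fix, roughly: setSparse stops returning an error entirely, so filesystems without sparse support (exFAT, for example) can never abort a download. A sketch of the caller-side pattern with a stand-in stub helper (the real Windows variant issues FSCTL_SET_SPARSE via golang.org/x/sys/windows, as the diff below shows):

```go
package main

import (
	"fmt"
	"log"
	"os"
)

// setSparse is deliberately best-effort: no return value, so an unsupported
// filesystem can never turn preallocation into a hard failure. This stub is
// the non-Windows no-op; the Windows version ignores the DeviceIoControl error.
func setSparse(*os.File) {}

func main() {
	f, err := os.CreateTemp("", "blob-")
	if err != nil {
		log.Fatal(err)
	}
	defer os.Remove(f.Name())
	defer f.Close()

	setSparse(f) // best effort; proceed with the download either way

	// With the sparse flag set (where supported), Truncate reserves the full
	// range without eagerly writing zeros, avoiding the up-front I/O storm
	// that patch 09 describes.
	if err := f.Truncate(64 << 20); err != nil {
		log.Fatal(err)
	}
	fi, err := f.Stat()
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println("logical size:", fi.Size())
}
```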
--- server/download.go | 4 +--- server/sparse_common.go | 3 +-- server/sparse_windows.go | 5 +++-- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/server/download.go b/server/download.go index 38d24a6b..5965b322 100644 --- a/server/download.go +++ b/server/download.go @@ -216,9 +216,7 @@ func (b *blobDownload) run(ctx context.Context, requestURL *url.URL, opts *regis return err } defer file.Close() - if err := setSparse(file); err != nil { - return err - } + setSparse(file) _ = file.Truncate(b.Total) diff --git a/server/sparse_common.go b/server/sparse_common.go index f25627fc..c88b2da0 100644 --- a/server/sparse_common.go +++ b/server/sparse_common.go @@ -4,6 +4,5 @@ package server import "os" -func setSparse(file *os.File) error { - return nil +func setSparse(*os.File) { } diff --git a/server/sparse_windows.go b/server/sparse_windows.go index cdad379e..f21cbbda 100644 --- a/server/sparse_windows.go +++ b/server/sparse_windows.go @@ -6,8 +6,9 @@ import ( "golang.org/x/sys/windows" ) -func setSparse(file *os.File) error { - return windows.DeviceIoControl( +func setSparse(file *os.File) { + // exFat (and other FS types) don't support sparse files, so ignore errors + windows.DeviceIoControl( //nolint:errcheck windows.Handle(file.Fd()), windows.FSCTL_SET_SPARSE, nil, 0, nil, 0, From d4e640746469ac586f12b400384c4ae7354d9280 Mon Sep 17 00:00:00 2001 From: Nicholas Schwab Date: Fri, 9 Aug 2024 23:14:13 +0200 Subject: [PATCH 24/71] Restrict text files with explicit line feeds to *.go. This partially reverts b732beba6a919b852539bb344b05e25c6a7c3c90. It seems that explicitly setting all files to use line feeds was done due to issues with the go linter, so the setting can be restricted to those files (https://github.com/ollama/ollama/pull/6235#issuecomment-2278745953). --- .gitattributes | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/.gitattributes b/.gitattributes index baabd3c1..f1c8bcb4 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,5 +1,3 @@ llm/ext_server/* linguist-vendored -* text eol=lf -*.png binary -*.ico binary -*.icns binary +* text=auto +*.go text eol=lf From 023451ce471e7781bee65505011c48b9e5541811 Mon Sep 17 00:00:00 2001 From: CognitiveTech Date: Sat, 10 Aug 2024 21:43:08 -0400 Subject: [PATCH 25/71] add integration obook-summary (#6305) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 7c606e1c..aae92e6c 100644 --- a/README.md +++ b/README.md @@ -325,6 +325,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [tlm](https://github.com/yusufcanb/tlm) - [podman-ollama](https://github.com/ericcurtin/podman-ollama) - [gollama](https://github.com/sammcj/gollama) +- [Ollama eBook Summary](https://github.com/cognitivetech/ollama-ebook-summary/) ### Database From 25906d72d1482bc9dc2e4300a42c8db4823ee1a3 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Sun, 11 Aug 2024 11:30:20 -0700 Subject: [PATCH 26/71] llm: prevent loading too large models on Windows (#5926) Don't allow loading models that would lead to memory exhaustion (across vram, system memory and disk paging). This check was already applied on Linux and should now be applied on Windows as well.
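In outline, the guard compares the CPU-resident portion of the load (the total estimate minus what lands in VRAM) against free RAM plus free swap; only macOS is exempt because its swap grows dynamically. A simplified sketch of the check (field and function names approximate the server code; this is not the exact implementation):

```go
package main

import (
	"fmt"
	"runtime"
)

type memoryEstimate struct {
	TotalSize uint64 // whole model footprint
	VRAMSize  uint64 // portion offloaded to the GPU
}

func checkSystemMemory(e memoryEstimate, freeMemory, freeSwap uint64) error {
	// Darwin has fully dynamic swap, so there is no fixed budget to enforce.
	if runtime.GOOS == "darwin" {
		return nil
	}
	required := e.TotalSize - e.VRAMSize // what must fit in system memory
	if available := freeMemory + freeSwap; required > available {
		return fmt.Errorf("model requires %d bytes of system memory, only %d available", required, available)
	}
	return nil
}

func main() {
	e := memoryEstimate{TotalSize: 8 << 30, VRAMSize: 6 << 30}
	fmt.Println(checkSystemMemory(e, 1<<30, 512<<20))
}
```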
--- llm/server.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llm/server.go b/llm/server.go index 41736068..0bd94f35 100644 --- a/llm/server.go +++ b/llm/server.go @@ -125,8 +125,9 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr } } - // On linux, over-allocating CPU memory will almost always result in an error - if runtime.GOOS == "linux" { + // On linux and windows, over-allocating CPU memory will almost always result in an error + // Darwin has fully dynamic swap so has no direct concept of free swap space + if runtime.GOOS != "darwin" { systemMemoryRequired := estimate.TotalSize - estimate.VRAMSize available := systemFreeMemory + systemSwapFreeMemory if systemMemoryRequired > available { From 15c2d8fe149ba2b58aadbab615a6955f8821c7a9 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Sun, 11 Aug 2024 11:57:10 -0700 Subject: [PATCH 27/71] server: parallelize embeddings in API web handler instead of in subprocess runner (#6220) For simplicity, perform parallelization of embedding requests in the API handler instead of offloading this to the subprocess runner. This keeps the scheduling story simpler as it builds on existing parallel requests, similar to existing text completion functionality. --- llm/ext_server/server.cpp | 42 ++++++++------------------------------- llm/server.go | 32 +++++++++++++---------------- server/routes.go | 42 +++++++++++++++++++++++++-------------- server/sched_test.go | 8 ++++---- 4 files changed, 53 insertions(+), 71 deletions(-) diff --git a/llm/ext_server/server.cpp b/llm/ext_server/server.cpp index c65901c7..5717c17a 100644 --- a/llm/ext_server/server.cpp +++ b/llm/ext_server/server.cpp @@ -1223,9 +1223,7 @@ struct llama_server_context res.result_json = json { - {"id", res.id}, {"embedding", std::vector(embd, embd + n_embd)}, - {"timings", slot.get_formated_timings()}, }; } } @@ -3194,41 +3192,17 @@ int main(int argc, char **argv) { prompt = ""; } - if (prompt.size() == 1) { - prompt = prompt[0]; - } - // create and queue the task - json responses; - { - const int id_task = llama.queue_tasks.get_new_id(); - llama.queue_results.add_waiting_task_id(id_task); - llama.request_completion(id_task, {{"prompt", prompt}}, true, -1); + const int task_id = llama.queue_tasks.get_new_id(); + llama.queue_results.add_waiting_task_id(task_id); + llama.request_completion(task_id, {{"prompt", prompt}}, true, -1); - // get the result - task_result result = llama.queue_results.recv(id_task); - llama.queue_results.remove_waiting_task_id(id_task); - if (result.error) { - return res.set_content(result.result_json.dump(), "application/json; charset=utf-8"); - } + // get the result + task_result result = llama.queue_results.recv(task_id); + llama.queue_results.remove_waiting_task_id(task_id); - responses = result.result_json.value("results", std::vector{result.result_json}); - std::sort(responses.begin(), responses.end(), [](const json& a, const json& b) { - return a["id"] < b["id"]; - }); - - json embeddings = json::array(); - - int prompt_n = 0; - for (auto & elem : responses) { - embeddings.push_back(elem.at("embedding")); - prompt_n += elem.at("timings").at("prompt_n").get(); - } - - // send the result - json embedding_res = json{{"embedding", embeddings}, {"prompt_n", prompt_n}}; - return res.set_content(embedding_res.dump(), "application/json; charset=utf-8"); - } + // send the result + return res.set_content(result.result_json.dump(), "application/json; charset=utf-8"); }); // GG: if I put the main loop inside a 
thread, it crashes on the first request when build in Debug!? diff --git a/llm/server.go b/llm/server.go index 0bd94f35..d2b8db9b 100644 --- a/llm/server.go +++ b/llm/server.go @@ -33,7 +33,7 @@ type LlamaServer interface { Ping(ctx context.Context) error WaitUntilRunning(ctx context.Context) error Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error - Embed(ctx context.Context, input []string) (*EmbedResponse, error) + Embedding(ctx context.Context, input string) ([]float32, error) Tokenize(ctx context.Context, content string) ([]int, error) Detokenize(ctx context.Context, tokens []int) (string, error) Close() error @@ -883,24 +883,20 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu return nil } -type EmbedRequest struct { - Content []string `json:"content"` +type EmbeddingRequest struct { + Content string `json:"content"` } -type EmbedResponse struct { - Embedding [][]float32 `json:"embedding"` - PromptEvalCount int `json:"prompt_n"` +type EmbeddingResponse struct { + Embedding []float32 `json:"embedding"` } -func (s *llmServer) Embed(ctx context.Context, input []string) (*EmbedResponse, error) { - // each input will use a slot, so we need to acquire the semaphore for - // the number of inputs up to numParallel - slots := int64(min(len(input), s.numParallel)) - if err := s.sem.Acquire(ctx, slots); err != nil { +func (s *llmServer) Embedding(ctx context.Context, input string) ([]float32, error) { + if err := s.sem.Acquire(ctx, 1); err != nil { slog.Error("Failed to acquire semaphore", "error", err) return nil, err } - defer s.sem.Release(slots) + defer s.sem.Release(1) // Make sure the server is ready status, err := s.getServerStatusRetry(ctx) @@ -910,18 +906,18 @@ func (s *llmServer) Embed(ctx context.Context, input []string) (*EmbedResponse, return nil, fmt.Errorf("unexpected server status: %s", status.ToString()) } - data, err := json.Marshal(EmbedRequest{Content: input}) + data, err := json.Marshal(EmbeddingRequest{Content: input}) if err != nil { return nil, fmt.Errorf("error marshaling embed data: %w", err) } - req, err := http.NewRequestWithContext(ctx, http.MethodPost, fmt.Sprintf("http://127.0.0.1:%d/embedding", s.port), bytes.NewBuffer(data)) + r, err := http.NewRequestWithContext(ctx, http.MethodPost, fmt.Sprintf("http://127.0.0.1:%d/embedding", s.port), bytes.NewBuffer(data)) if err != nil { return nil, fmt.Errorf("error creating embed request: %w", err) } - req.Header.Set("Content-Type", "application/json") + r.Header.Set("Content-Type", "application/json") - resp, err := http.DefaultClient.Do(req) + resp, err := http.DefaultClient.Do(r) if err != nil { return nil, fmt.Errorf("do embedding request: %w", err) } @@ -937,12 +933,12 @@ func (s *llmServer) Embed(ctx context.Context, input []string) (*EmbedResponse, return nil, fmt.Errorf("%s", body) } - var e EmbedResponse + var e EmbeddingResponse if err := json.Unmarshal(body, &e); err != nil { return nil, fmt.Errorf("unmarshal tokenize response: %w", err) } - return &e, nil + return e.Embedding, nil } type TokenizeRequest struct { diff --git a/server/routes.go b/server/routes.go index e55eaa9d..e5a31002 100644 --- a/server/routes.go +++ b/server/routes.go @@ -23,6 +23,7 @@ import ( "github.com/gin-contrib/cors" "github.com/gin-gonic/gin" + "golang.org/x/sync/errgroup" "github.com/ollama/ollama/api" "github.com/ollama/ollama/envconfig" @@ -346,6 +347,7 @@ func (s *Server) EmbedHandler(c *gin.Context) { return } + var count int for i, s := range input { 
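		// Aside: the fan-out this patch adds further down is the stock errgroup
		// pattern; a minimal standalone sketch, assuming a hypothetical
		// embedOne(ctx, s) ([]float32, error) helper rather than the patch's code:
		//
		//	g, ctx := errgroup.WithContext(c.Request.Context())
		//	results := make([][]float32, len(input))
		//	for i, text := range input {
		//		g.Go(func() error {
		//			e, err := embedOne(ctx, text)
		//			if err != nil {
		//				return err
		//			}
		//			results[i] = e
		//			return nil
		//		})
		//	}
		//	if err := g.Wait(); err != nil {
		//		return err // first failure wins; ctx is cancelled for the siblings
		//	}
		//
		// Concurrency stays bounded even without a limit here, because each
		// Embedding call acquires one slot from the runner's semaphore
		// (s.sem.Acquire in llm/server.go above); g.SetLimit(n) would be an
		// alternative way to cap in-flight goroutines.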
tokens, err := r.Tokenize(c.Request.Context(), s) if err != nil { @@ -368,25 +370,36 @@ func (s *Server) EmbedHandler(c *gin.Context) { } } + count += len(tokens) + input[i] = s } - embeddings, err := r.Embed(c.Request.Context(), input) - if err != nil { - slog.Error("embedding generation failed", "error", err) - c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to generate embedding"}) - return + + var g errgroup.Group + embeddings := make([][]float32, len(input)) + for i, text := range input { + g.Go(func() error { + embedding, err := r.Embedding(c.Request.Context(), text) + if err != nil { + return err + } + embeddings[i] = normalize(embedding) + return nil + }) } - for i, e := range embeddings.Embedding { - embeddings.Embedding[i] = normalize(e) + if err := g.Wait(); err != nil { + slog.Error("embedding generation failed", "error", err) + c.JSON(http.StatusInternalServerError, gin.H{"error": fmt.Errorf("failed to generate embeddings: %v", err)}) + return } resp := api.EmbedResponse{ Model: req.Model, - Embeddings: embeddings.Embedding, + Embeddings: embeddings, TotalDuration: time.Since(checkpointStart), LoadDuration: checkpointLoaded.Sub(checkpointStart), - PromptEvalCount: embeddings.PromptEvalCount, + PromptEvalCount: count, } c.JSON(http.StatusOK, resp) } @@ -430,21 +443,20 @@ func (s *Server) EmbeddingsHandler(c *gin.Context) { return } - embeddings, err := r.Embed(c.Request.Context(), []string{req.Prompt}) + embedding, err := r.Embedding(c.Request.Context(), req.Prompt) if err != nil { slog.Info(fmt.Sprintf("embedding generation failed: %v", err)) c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to generate embedding"}) return } - embedding := make([]float64, len(embeddings.Embedding[0])) - - for i, v := range embeddings.Embedding[0] { - embedding[i] = float64(v) + var e []float64 + for _, v := range embedding { + e = append(e, float64(v)) } resp := api.EmbeddingResponse{ - Embedding: embedding, + Embedding: e, } c.JSON(http.StatusOK, resp) } diff --git a/server/sched_test.go b/server/sched_test.go index c8717430..713b9259 100644 --- a/server/sched_test.go +++ b/server/sched_test.go @@ -708,8 +708,8 @@ type mockLlm struct { pingResp error waitResp error completionResp error - embedResp *llm.EmbedResponse - embedRespErr error + embeddingResp []float32 + embeddingRespErr error tokenizeResp []int tokenizeRespErr error detokenizeResp string @@ -727,8 +727,8 @@ func (s *mockLlm) Completion(ctx context.Context, req llm.CompletionRequest, fn return s.completionResp } -func (s *mockLlm) Embed(ctx context.Context, input []string) (*llm.EmbedResponse, error) { - return s.embedResp, s.embedRespErr +func (s *mockLlm) Embedding(ctx context.Context, input string) ([]float32, error) { + return s.embeddingResp, s.embeddingRespErr } func (s *mockLlm) Tokenize(ctx context.Context, content string) ([]int, error) { From 8aac22438ef34192ff804dbeb1b5e9a7e180eb7c Mon Sep 17 00:00:00 2001 From: Josh <76125168+joshyan1@users.noreply.github.com> Date: Mon, 12 Aug 2024 09:28:55 -0700 Subject: [PATCH 28/71] server: speed up single gguf creates (#5898) --- server/model.go | 17 +++++++-- server/model_test.go | 82 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 96 insertions(+), 3 deletions(-) diff --git a/server/model.go b/server/model.go index ad6e4e55..8d7ed7e6 100644 --- a/server/model.go +++ b/server/model.go @@ -176,9 +176,20 @@ func parseFromFile(ctx context.Context, file *os.File, digest string, fn func(ap mediatype = "application/vnd.ollama.image.projector" } - 
layer, err := NewLayer(io.NewSectionReader(file, offset, n), mediatype) - if err != nil { - return nil, err + var layer *Layer + if digest != "" && n == stat.Size() && offset == 0 { + layer, err = NewLayerFromLayer(digest, mediatype, file.Name()) + if err != nil { + slog.Debug("could not create new layer from layer", "error", err) + } + } + + // Fallback to creating layer from file copy (either NewLayerFromLayer failed, or digest empty/n != stat.Size()) + if layer == nil { + layer, err = NewLayer(io.NewSectionReader(file, offset, n), mediatype) + if err != nil { + return nil, err + } } layers = append(layers, &layerGGML{layer, ggml}) diff --git a/server/model_test.go b/server/model_test.go index aa214d3d..63fc408d 100644 --- a/server/model_test.go +++ b/server/model_test.go @@ -2,8 +2,10 @@ package server import ( "bytes" + "context" "encoding/json" "fmt" + "io" "os" "path/filepath" "testing" @@ -11,6 +13,7 @@ import ( "github.com/google/go-cmp/cmp" "github.com/ollama/ollama/api" + "github.com/ollama/ollama/llm" "github.com/ollama/ollama/template" ) @@ -133,3 +136,82 @@ The temperature in San Francisco, CA is 70°F and in Toronto, Canada is 20°C.`, }) } } + +func TestParseFromFileFromLayer(t *testing.T) { + tempModels := t.TempDir() + + file, err := os.CreateTemp(tempModels, "") + if err != nil { + t.Fatalf("failed to open file: %v", err) + } + defer file.Close() + if err := llm.WriteGGUF(file, llm.KV{"general.architecture": "gemma"}, []llm.Tensor{}); err != nil { + t.Fatalf("failed to write gguf: %v", err) + } + + if _, err := file.Seek(0, io.SeekStart); err != nil { + t.Fatalf("failed to seek to start: %v", err) + } + + layers, err := parseFromFile(context.Background(), file, "", func(api.ProgressResponse) {}) + if err != nil { + t.Fatalf("failed to parse from file: %v", err) + } + + if len(layers) != 1 { + t.Fatalf("got %d != want 1", len(layers)) + } + + if _, err := file.Seek(0, io.SeekStart); err != nil { + t.Fatalf("failed to seek to start: %v", err) + } + + layers2, err := parseFromFile(context.Background(), file, layers[0].Digest, func(api.ProgressResponse) {}) + if err != nil { + t.Fatalf("failed to parse from file: %v", err) + } + if len(layers2) != 1 { + t.Fatalf("got %d != want 1", len(layers2)) + } + + if layers[0].Digest != layers2[0].Digest { + t.Fatalf("got %s != want %s", layers[0].Digest, layers2[0].Digest) + } + + if layers[0].Size != layers2[0].Size { + t.Fatalf("got %d != want %d", layers[0].Size, layers2[0].Size) + } + + if layers[0].MediaType != layers2[0].MediaType { + t.Fatalf("got %v != want %v", layers[0].MediaType, layers2[0].MediaType) + } +} + +func TestParseLayerFromCopy(t *testing.T) { + tempModels := t.TempDir() + + file2, err := os.CreateTemp(tempModels, "") + if err != nil { + t.Fatalf("failed to open file: %v", err) + } + defer file2.Close() + + for range 5 { + if err := llm.WriteGGUF(file2, llm.KV{"general.architecture": "gemma"}, []llm.Tensor{}); err != nil { + t.Fatalf("failed to write gguf: %v", err) + } + } + + if _, err := file2.Seek(0, io.SeekStart); err != nil { + t.Fatalf("failed to seek to start: %v", err) + } + + layers, err := parseFromFile(context.Background(), file2, "", func(api.ProgressResponse) {}) + if err != nil { + t.Fatalf("failed to parse from file: %v", err) + } + + if len(layers) != 5 { + t.Fatalf("got %d != want 5", len(layers)) + } +} From 1dc3ef3aa9d451a63fcb6ea2e1b6ea5289a1a325 Mon Sep 17 00:00:00 2001 From: Josh <76125168+joshyan1@users.noreply.github.com> Date: Mon, 12 Aug 2024 09:57:51 -0700 Subject: [PATCH 29/71] Revert 
"server: speed up single gguf creates (#5898)" (#6323) This reverts commit 8aac22438ef34192ff804dbeb1b5e9a7e180eb7c. --- server/model.go | 17 ++------- server/model_test.go | 82 -------------------------------------------- 2 files changed, 3 insertions(+), 96 deletions(-) diff --git a/server/model.go b/server/model.go index 8d7ed7e6..ad6e4e55 100644 --- a/server/model.go +++ b/server/model.go @@ -176,20 +176,9 @@ func parseFromFile(ctx context.Context, file *os.File, digest string, fn func(ap mediatype = "application/vnd.ollama.image.projector" } - var layer *Layer - if digest != "" && n == stat.Size() && offset == 0 { - layer, err = NewLayerFromLayer(digest, mediatype, file.Name()) - if err != nil { - slog.Debug("could not create new layer from layer", "error", err) - } - } - - // Fallback to creating layer from file copy (either NewLayerFromLayer failed, or digest empty/n != stat.Size()) - if layer == nil { - layer, err = NewLayer(io.NewSectionReader(file, offset, n), mediatype) - if err != nil { - return nil, err - } + layer, err := NewLayer(io.NewSectionReader(file, offset, n), mediatype) + if err != nil { + return nil, err } layers = append(layers, &layerGGML{layer, ggml}) diff --git a/server/model_test.go b/server/model_test.go index 63fc408d..aa214d3d 100644 --- a/server/model_test.go +++ b/server/model_test.go @@ -2,10 +2,8 @@ package server import ( "bytes" - "context" "encoding/json" "fmt" - "io" "os" "path/filepath" "testing" @@ -13,7 +11,6 @@ import ( "github.com/google/go-cmp/cmp" "github.com/ollama/ollama/api" - "github.com/ollama/ollama/llm" "github.com/ollama/ollama/template" ) @@ -136,82 +133,3 @@ The temperature in San Francisco, CA is 70°F and in Toronto, Canada is 20°C.`, }) } } - -func TestParseFromFileFromLayer(t *testing.T) { - tempModels := t.TempDir() - - file, err := os.CreateTemp(tempModels, "") - if err != nil { - t.Fatalf("failed to open file: %v", err) - } - defer file.Close() - if err := llm.WriteGGUF(file, llm.KV{"general.architecture": "gemma"}, []llm.Tensor{}); err != nil { - t.Fatalf("failed to write gguf: %v", err) - } - - if _, err := file.Seek(0, io.SeekStart); err != nil { - t.Fatalf("failed to seek to start: %v", err) - } - - layers, err := parseFromFile(context.Background(), file, "", func(api.ProgressResponse) {}) - if err != nil { - t.Fatalf("failed to parse from file: %v", err) - } - - if len(layers) != 1 { - t.Fatalf("got %d != want 1", len(layers)) - } - - if _, err := file.Seek(0, io.SeekStart); err != nil { - t.Fatalf("failed to seek to start: %v", err) - } - - layers2, err := parseFromFile(context.Background(), file, layers[0].Digest, func(api.ProgressResponse) {}) - if err != nil { - t.Fatalf("failed to parse from file: %v", err) - } - if len(layers2) != 1 { - t.Fatalf("got %d != want 1", len(layers2)) - } - - if layers[0].Digest != layers2[0].Digest { - t.Fatalf("got %s != want %s", layers[0].Digest, layers2[0].Digest) - } - - if layers[0].Size != layers2[0].Size { - t.Fatalf("got %d != want %d", layers[0].Size, layers2[0].Size) - } - - if layers[0].MediaType != layers2[0].MediaType { - t.Fatalf("got %v != want %v", layers[0].MediaType, layers2[0].MediaType) - } -} - -func TestParseLayerFromCopy(t *testing.T) { - tempModels := t.TempDir() - - file2, err := os.CreateTemp(tempModels, "") - if err != nil { - t.Fatalf("failed to open file: %v", err) - } - defer file2.Close() - - for range 5 { - if err := llm.WriteGGUF(file2, llm.KV{"general.architecture": "gemma"}, []llm.Tensor{}); err != nil { - t.Fatalf("failed to write gguf: %v", err) - } - 
} - - if _, err := file2.Seek(0, io.SeekStart); err != nil { - t.Fatalf("failed to seek to start: %v", err) - } - - layers, err := parseFromFile(context.Background(), file2, "", func(api.ProgressResponse) {}) - if err != nil { - t.Fatalf("failed to parse from file: %v", err) - } - - if len(layers) != 5 { - t.Fatalf("got %d != want 5", len(layers)) - } -} From 01d544d373d0f7782a9da2a830e0e7fa6926a584 Mon Sep 17 00:00:00 2001 From: royjhan <65097070+royjhan@users.noreply.github.com> Date: Mon, 12 Aug 2024 13:33:34 -0400 Subject: [PATCH 30/71] OpenAI: Simplify input output in testing (#5858) * simplify input output * direct comp * in line image * rm error pointer type * update response testing * lint --- openai/openai_test.go | 668 ++++++++++++++++++++++-------------------- 1 file changed, 344 insertions(+), 324 deletions(-) diff --git a/openai/openai_test.go b/openai/openai_test.go index e08a96c9..c7e9f384 100644 --- a/openai/openai_test.go +++ b/openai/openai_test.go @@ -7,27 +7,22 @@ import ( "io" "net/http" "net/http/httptest" + "reflect" "strings" "testing" "time" "github.com/gin-gonic/gin" - "github.com/stretchr/testify/assert" "github.com/ollama/ollama/api" ) const ( - prefix = `data:image/jpeg;base64,` - image = `iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNk+A8AAQUBAScY42YAAAAASUVORK5CYII=` - imageURL = prefix + image + prefix = `data:image/jpeg;base64,` + image = `iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNk+A8AAQUBAScY42YAAAAASUVORK5CYII=` ) -func prepareRequest(req *http.Request, body any) { - bodyBytes, _ := json.Marshal(body) - req.Body = io.NopCloser(bytes.NewReader(bodyBytes)) - req.Header.Set("Content-Type", "application/json") -} +var False = false func captureRequestMiddleware(capturedRequest any) gin.HandlerFunc { return func(c *gin.Context) { @@ -43,134 +38,136 @@ func captureRequestMiddleware(capturedRequest any) gin.HandlerFunc { func TestChatMiddleware(t *testing.T) { type testCase struct { - Name string - Setup func(t *testing.T, req *http.Request) - Expected func(t *testing.T, req *api.ChatRequest, resp *httptest.ResponseRecorder) + name string + body string + req api.ChatRequest + err ErrorResponse } var capturedRequest *api.ChatRequest testCases := []testCase{ { - Name: "chat handler", - Setup: func(t *testing.T, req *http.Request) { - body := ChatCompletionRequest{ - Model: "test-model", - Messages: []Message{{Role: "user", Content: "Hello"}}, - } - prepareRequest(req, body) - }, - Expected: func(t *testing.T, req *api.ChatRequest, resp *httptest.ResponseRecorder) { - if resp.Code != http.StatusOK { - t.Fatalf("expected 200, got %d", resp.Code) - } - - if req.Messages[0].Role != "user" { - t.Fatalf("expected 'user', got %s", req.Messages[0].Role) - } - - if req.Messages[0].Content != "Hello" { - t.Fatalf("expected 'Hello', got %s", req.Messages[0].Content) - } + name: "chat handler", + body: `{ + "model": "test-model", + "messages": [ + {"role": "user", "content": "Hello"} + ] + }`, + req: api.ChatRequest{ + Model: "test-model", + Messages: []api.Message{ + { + Role: "user", + Content: "Hello", + }, + }, + Options: map[string]any{ + "temperature": 1.0, + "top_p": 1.0, + }, + Stream: &False, }, }, { - Name: "chat handler with image content", - Setup: func(t *testing.T, req *http.Request) { - body := ChatCompletionRequest{ - Model: "test-model", - Messages: []Message{ - { - Role: "user", Content: []map[string]any{ - {"type": "text", "text": "Hello"}, - {"type": "image_url", "image_url": map[string]string{"url": imageURL}}, + name: "chat 
handler with image content", + body: `{ + "model": "test-model", + "messages": [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "Hello" + }, + { + "type": "image_url", + "image_url": { + "url": "` + prefix + image + `" + } + } + ] + } + ] + }`, + req: api.ChatRequest{ + Model: "test-model", + Messages: []api.Message{ + { + Role: "user", + Content: "Hello", + }, + { + Role: "user", + Images: []api.ImageData{ + func() []byte { + img, _ := base64.StdEncoding.DecodeString(image) + return img + }(), + }, + }, + }, + Options: map[string]any{ + "temperature": 1.0, + "top_p": 1.0, + }, + Stream: &False, + }, + }, + { + name: "chat handler with tools", + body: `{ + "model": "test-model", + "messages": [ + {"role": "user", "content": "What's the weather like in Paris Today?"}, + {"role": "assistant", "tool_calls": [{"id": "id", "type": "function", "function": {"name": "get_current_weather", "arguments": "{\"location\": \"Paris, France\", \"format\": \"celsius\"}"}}]} + ] + }`, + req: api.ChatRequest{ + Model: "test-model", + Messages: []api.Message{ + { + Role: "user", + Content: "What's the weather like in Paris Today?", + }, + { + Role: "assistant", + ToolCalls: []api.ToolCall{ + { + Function: api.ToolCallFunction{ + Name: "get_current_weather", + Arguments: map[string]interface{}{ + "location": "Paris, France", + "format": "celsius", + }, + }, }, }, }, - } - prepareRequest(req, body) - }, - Expected: func(t *testing.T, req *api.ChatRequest, resp *httptest.ResponseRecorder) { - if resp.Code != http.StatusOK { - t.Fatalf("expected 200, got %d", resp.Code) - } - - if req.Messages[0].Role != "user" { - t.Fatalf("expected 'user', got %s", req.Messages[0].Role) - } - - if req.Messages[0].Content != "Hello" { - t.Fatalf("expected 'Hello', got %s", req.Messages[0].Content) - } - - img, _ := base64.StdEncoding.DecodeString(imageURL[len(prefix):]) - - if req.Messages[1].Role != "user" { - t.Fatalf("expected 'user', got %s", req.Messages[1].Role) - } - - if !bytes.Equal(req.Messages[1].Images[0], img) { - t.Fatalf("expected image encoding, got %s", req.Messages[1].Images[0]) - } + }, + Options: map[string]any{ + "temperature": 1.0, + "top_p": 1.0, + }, + Stream: &False, }, }, + { - Name: "chat handler with tools", - Setup: func(t *testing.T, req *http.Request) { - body := ChatCompletionRequest{ - Model: "test-model", - Messages: []Message{ - {Role: "user", Content: "What's the weather like in Paris Today?"}, - {Role: "assistant", ToolCalls: []ToolCall{{ - ID: "id", - Type: "function", - Function: struct { - Name string `json:"name"` - Arguments string `json:"arguments"` - }{ - Name: "get_current_weather", - Arguments: "{\"location\": \"Paris, France\", \"format\": \"celsius\"}", - }, - }}}, - }, - } - prepareRequest(req, body) - }, - Expected: func(t *testing.T, req *api.ChatRequest, resp *httptest.ResponseRecorder) { - if resp.Code != 200 { - t.Fatalf("expected 200, got %d", resp.Code) - } - - if req.Messages[0].Content != "What's the weather like in Paris Today?" 
{ - t.Fatalf("expected What's the weather like in Paris Today?, got %s", req.Messages[0].Content) - } - - if req.Messages[1].ToolCalls[0].Function.Arguments["location"] != "Paris, France" { - t.Fatalf("expected 'Paris, France', got %v", req.Messages[1].ToolCalls[0].Function.Arguments["location"]) - } - - if req.Messages[1].ToolCalls[0].Function.Arguments["format"] != "celsius" { - t.Fatalf("expected celsius, got %v", req.Messages[1].ToolCalls[0].Function.Arguments["format"]) - } - }, - }, - { - Name: "chat handler error forwarding", - Setup: func(t *testing.T, req *http.Request) { - body := ChatCompletionRequest{ - Model: "test-model", - Messages: []Message{{Role: "user", Content: 2}}, - } - prepareRequest(req, body) - }, - Expected: func(t *testing.T, req *api.ChatRequest, resp *httptest.ResponseRecorder) { - if resp.Code != http.StatusBadRequest { - t.Fatalf("expected 400, got %d", resp.Code) - } - - if !strings.Contains(resp.Body.String(), "invalid message content type") { - t.Fatalf("error was not forwarded") - } + name: "chat handler error forwarding", + body: `{ + "model": "test-model", + "messages": [ + {"role": "user", "content": 2} + ] + }`, + err: ErrorResponse{ + Error: Error{ + Message: "invalid message content type: float64", + Type: "invalid_request_error", + }, }, }, } @@ -185,16 +182,26 @@ func TestChatMiddleware(t *testing.T) { router.Handle(http.MethodPost, "/api/chat", endpoint) for _, tc := range testCases { - t.Run(tc.Name, func(t *testing.T) { - req, _ := http.NewRequest(http.MethodPost, "/api/chat", nil) - - tc.Setup(t, req) + t.Run(tc.name, func(t *testing.T) { + req, _ := http.NewRequest(http.MethodPost, "/api/chat", strings.NewReader(tc.body)) + req.Header.Set("Content-Type", "application/json") resp := httptest.NewRecorder() router.ServeHTTP(resp, req) - tc.Expected(t, capturedRequest, resp) + var errResp ErrorResponse + if resp.Code != http.StatusOK { + if err := json.Unmarshal(resp.Body.Bytes(), &errResp); err != nil { + t.Fatal(err) + } + } + if capturedRequest != nil && !reflect.DeepEqual(tc.req, *capturedRequest) { + t.Fatal("requests did not match") + } + if !reflect.DeepEqual(tc.err, errResp) { + t.Fatal("errors did not match") + } capturedRequest = nil }) } @@ -202,71 +209,52 @@ func TestChatMiddleware(t *testing.T) { func TestCompletionsMiddleware(t *testing.T) { type testCase struct { - Name string - Setup func(t *testing.T, req *http.Request) - Expected func(t *testing.T, req *api.GenerateRequest, resp *httptest.ResponseRecorder) + name string + body string + req api.GenerateRequest + err ErrorResponse } var capturedRequest *api.GenerateRequest testCases := []testCase{ { - Name: "completions handler", - Setup: func(t *testing.T, req *http.Request) { - temp := float32(0.8) - body := CompletionRequest{ - Model: "test-model", - Prompt: "Hello", - Temperature: &temp, - Stop: []string{"\n", "stop"}, - Suffix: "suffix", - } - prepareRequest(req, body) - }, - Expected: func(t *testing.T, req *api.GenerateRequest, resp *httptest.ResponseRecorder) { - if req.Prompt != "Hello" { - t.Fatalf("expected 'Hello', got %s", req.Prompt) - } - - if req.Options["temperature"] != 1.6 { - t.Fatalf("expected 1.6, got %f", req.Options["temperature"]) - } - - stopTokens, ok := req.Options["stop"].([]any) - - if !ok { - t.Fatalf("expected stop tokens to be a list") - } - - if stopTokens[0] != "\n" || stopTokens[1] != "stop" { - t.Fatalf("expected ['\\n', 'stop'], got %v", stopTokens) - } - - if req.Suffix != "suffix" { - t.Fatalf("expected 'suffix', got %s", req.Suffix) - } + 
name: "completions handler", + body: `{ + "model": "test-model", + "prompt": "Hello", + "temperature": 0.8, + "stop": ["\n", "stop"], + "suffix": "suffix" + }`, + req: api.GenerateRequest{ + Model: "test-model", + Prompt: "Hello", + Options: map[string]any{ + "frequency_penalty": 0.0, + "presence_penalty": 0.0, + "temperature": 1.6, + "top_p": 1.0, + "stop": []any{"\n", "stop"}, + }, + Suffix: "suffix", + Stream: &False, }, }, { - Name: "completions handler error forwarding", - Setup: func(t *testing.T, req *http.Request) { - body := CompletionRequest{ - Model: "test-model", - Prompt: "Hello", - Temperature: nil, - Stop: []int{1, 2}, - Suffix: "suffix", - } - prepareRequest(req, body) - }, - Expected: func(t *testing.T, req *api.GenerateRequest, resp *httptest.ResponseRecorder) { - if resp.Code != http.StatusBadRequest { - t.Fatalf("expected 400, got %d", resp.Code) - } - - if !strings.Contains(resp.Body.String(), "invalid type for 'stop' field") { - t.Fatalf("error was not forwarded") - } + name: "completions handler error forwarding", + body: `{ + "model": "test-model", + "prompt": "Hello", + "temperature": null, + "stop": [1, 2], + "suffix": "suffix" + }`, + err: ErrorResponse{ + Error: Error{ + Message: "invalid type for 'stop' field: float64", + Type: "invalid_request_error", + }, }, }, } @@ -281,15 +269,27 @@ func TestCompletionsMiddleware(t *testing.T) { router.Handle(http.MethodPost, "/api/generate", endpoint) for _, tc := range testCases { - t.Run(tc.Name, func(t *testing.T) { - req, _ := http.NewRequest(http.MethodPost, "/api/generate", nil) - - tc.Setup(t, req) + t.Run(tc.name, func(t *testing.T) { + req, _ := http.NewRequest(http.MethodPost, "/api/generate", strings.NewReader(tc.body)) + req.Header.Set("Content-Type", "application/json") resp := httptest.NewRecorder() router.ServeHTTP(resp, req) - tc.Expected(t, capturedRequest, resp) + var errResp ErrorResponse + if resp.Code != http.StatusOK { + if err := json.Unmarshal(resp.Body.Bytes(), &errResp); err != nil { + t.Fatal(err) + } + } + + if capturedRequest != nil && !reflect.DeepEqual(tc.req, *capturedRequest) { + t.Fatal("requests did not match") + } + + if !reflect.DeepEqual(tc.err, errResp) { + t.Fatal("errors did not match") + } capturedRequest = nil }) @@ -298,78 +298,47 @@ func TestCompletionsMiddleware(t *testing.T) { func TestEmbeddingsMiddleware(t *testing.T) { type testCase struct { - Name string - Setup func(t *testing.T, req *http.Request) - Expected func(t *testing.T, req *api.EmbedRequest, resp *httptest.ResponseRecorder) + name string + body string + req api.EmbedRequest + err ErrorResponse } var capturedRequest *api.EmbedRequest testCases := []testCase{ { - Name: "embed handler single input", - Setup: func(t *testing.T, req *http.Request) { - body := EmbedRequest{ - Input: "Hello", - Model: "test-model", - } - prepareRequest(req, body) - }, - Expected: func(t *testing.T, req *api.EmbedRequest, resp *httptest.ResponseRecorder) { - if req.Input != "Hello" { - t.Fatalf("expected 'Hello', got %s", req.Input) - } - - if req.Model != "test-model" { - t.Fatalf("expected 'test-model', got %s", req.Model) - } + name: "embed handler single input", + body: `{ + "input": "Hello", + "model": "test-model" + }`, + req: api.EmbedRequest{ + Input: "Hello", + Model: "test-model", }, }, { - Name: "embed handler batch input", - Setup: func(t *testing.T, req *http.Request) { - body := EmbedRequest{ - Input: []string{"Hello", "World"}, - Model: "test-model", - } - prepareRequest(req, body) - }, - Expected: func(t *testing.T, req 
*api.EmbedRequest, resp *httptest.ResponseRecorder) { - input, ok := req.Input.([]any) - - if !ok { - t.Fatalf("expected input to be a list") - } - - if input[0].(string) != "Hello" { - t.Fatalf("expected 'Hello', got %s", input[0]) - } - - if input[1].(string) != "World" { - t.Fatalf("expected 'World', got %s", input[1]) - } - - if req.Model != "test-model" { - t.Fatalf("expected 'test-model', got %s", req.Model) - } + name: "embed handler batch input", + body: `{ + "input": ["Hello", "World"], + "model": "test-model" + }`, + req: api.EmbedRequest{ + Input: []any{"Hello", "World"}, + Model: "test-model", }, }, { - Name: "embed handler error forwarding", - Setup: func(t *testing.T, req *http.Request) { - body := EmbedRequest{ - Model: "test-model", - } - prepareRequest(req, body) - }, - Expected: func(t *testing.T, req *api.EmbedRequest, resp *httptest.ResponseRecorder) { - if resp.Code != http.StatusBadRequest { - t.Fatalf("expected 400, got %d", resp.Code) - } - - if !strings.Contains(resp.Body.String(), "invalid input") { - t.Fatalf("error was not forwarded") - } + name: "embed handler error forwarding", + body: `{ + "model": "test-model" + }`, + err: ErrorResponse{ + Error: Error{ + Message: "invalid input", + Type: "invalid_request_error", + }, }, }, } @@ -384,116 +353,167 @@ func TestEmbeddingsMiddleware(t *testing.T) { router.Handle(http.MethodPost, "/api/embed", endpoint) for _, tc := range testCases { - t.Run(tc.Name, func(t *testing.T) { - req, _ := http.NewRequest(http.MethodPost, "/api/embed", nil) - - tc.Setup(t, req) + t.Run(tc.name, func(t *testing.T) { + req, _ := http.NewRequest(http.MethodPost, "/api/embed", strings.NewReader(tc.body)) + req.Header.Set("Content-Type", "application/json") resp := httptest.NewRecorder() router.ServeHTTP(resp, req) - tc.Expected(t, capturedRequest, resp) + var errResp ErrorResponse + if resp.Code != http.StatusOK { + if err := json.Unmarshal(resp.Body.Bytes(), &errResp); err != nil { + t.Fatal(err) + } + } + + if capturedRequest != nil && !reflect.DeepEqual(tc.req, *capturedRequest) { + t.Fatal("requests did not match") + } + + if !reflect.DeepEqual(tc.err, errResp) { + t.Fatal("errors did not match") + } capturedRequest = nil }) } } -func TestMiddlewareResponses(t *testing.T) { +func TestListMiddleware(t *testing.T) { type testCase struct { - Name string - Method string - Path string - TestPath string - Handler func() gin.HandlerFunc - Endpoint func(c *gin.Context) - Setup func(t *testing.T, req *http.Request) - Expected func(t *testing.T, resp *httptest.ResponseRecorder) + name string + endpoint func(c *gin.Context) + resp string } testCases := []testCase{ { - Name: "list handler", - Method: http.MethodGet, - Path: "/api/tags", - TestPath: "/api/tags", - Handler: ListMiddleware, - Endpoint: func(c *gin.Context) { + name: "list handler", + endpoint: func(c *gin.Context) { c.JSON(http.StatusOK, api.ListResponse{ Models: []api.ListModelResponse{ { - Name: "Test Model", + Name: "test-model", + ModifiedAt: time.Unix(int64(1686935002), 0).UTC(), }, }, }) }, - Expected: func(t *testing.T, resp *httptest.ResponseRecorder) { - var listResp ListCompletion - if err := json.NewDecoder(resp.Body).Decode(&listResp); err != nil { - t.Fatal(err) - } - - if listResp.Object != "list" { - t.Fatalf("expected list, got %s", listResp.Object) - } - - if len(listResp.Data) != 1 { - t.Fatalf("expected 1, got %d", len(listResp.Data)) - } - - if listResp.Data[0].Id != "Test Model" { - t.Fatalf("expected Test Model, got %s", listResp.Data[0].Id) - } - }, + resp: `{ + 
"object": "list", + "data": [ + { + "id": "test-model", + "object": "model", + "created": 1686935002, + "owned_by": "library" + } + ] + }`, }, { - Name: "retrieve model", - Method: http.MethodGet, - Path: "/api/show/:model", - TestPath: "/api/show/test-model", - Handler: RetrieveMiddleware, - Endpoint: func(c *gin.Context) { - c.JSON(http.StatusOK, api.ShowResponse{ - ModifiedAt: time.Date(2024, 6, 17, 13, 45, 0, 0, time.UTC), - }) - }, - Expected: func(t *testing.T, resp *httptest.ResponseRecorder) { - var retrieveResp Model - if err := json.NewDecoder(resp.Body).Decode(&retrieveResp); err != nil { - t.Fatal(err) - } - - if retrieveResp.Object != "model" { - t.Fatalf("Expected object to be model, got %s", retrieveResp.Object) - } - - if retrieveResp.Id != "test-model" { - t.Fatalf("Expected id to be test-model, got %s", retrieveResp.Id) - } + name: "list handler empty output", + endpoint: func(c *gin.Context) { + c.JSON(http.StatusOK, api.ListResponse{}) }, + resp: `{ + "object": "list", + "data": null + }`, }, } gin.SetMode(gin.TestMode) - router := gin.New() for _, tc := range testCases { - t.Run(tc.Name, func(t *testing.T) { - router = gin.New() - router.Use(tc.Handler()) - router.Handle(tc.Method, tc.Path, tc.Endpoint) - req, _ := http.NewRequest(tc.Method, tc.TestPath, nil) + router := gin.New() + router.Use(ListMiddleware()) + router.Handle(http.MethodGet, "/api/tags", tc.endpoint) + req, _ := http.NewRequest(http.MethodGet, "/api/tags", nil) - if tc.Setup != nil { - tc.Setup(t, req) - } + resp := httptest.NewRecorder() + router.ServeHTTP(resp, req) - resp := httptest.NewRecorder() - router.ServeHTTP(resp, req) + var expected, actual map[string]any + err := json.Unmarshal([]byte(tc.resp), &expected) + if err != nil { + t.Fatalf("failed to unmarshal expected response: %v", err) + } - assert.Equal(t, http.StatusOK, resp.Code) + err = json.Unmarshal(resp.Body.Bytes(), &actual) + if err != nil { + t.Fatalf("failed to unmarshal actual response: %v", err) + } - tc.Expected(t, resp) - }) + if !reflect.DeepEqual(expected, actual) { + t.Errorf("responses did not match\nExpected: %+v\nActual: %+v", expected, actual) + } + } +} + +func TestRetrieveMiddleware(t *testing.T) { + type testCase struct { + name string + endpoint func(c *gin.Context) + resp string + } + + testCases := []testCase{ + { + name: "retrieve handler", + endpoint: func(c *gin.Context) { + c.JSON(http.StatusOK, api.ShowResponse{ + ModifiedAt: time.Unix(int64(1686935002), 0).UTC(), + }) + }, + resp: `{ + "id":"test-model", + "object":"model", + "created":1686935002, + "owned_by":"library"} + `, + }, + { + name: "retrieve handler error forwarding", + endpoint: func(c *gin.Context) { + c.JSON(http.StatusBadRequest, gin.H{"error": "model not found"}) + }, + resp: `{ + "error": { + "code": null, + "message": "model not found", + "param": null, + "type": "api_error" + } + }`, + }, + } + + gin.SetMode(gin.TestMode) + + for _, tc := range testCases { + router := gin.New() + router.Use(RetrieveMiddleware()) + router.Handle(http.MethodGet, "/api/show/:model", tc.endpoint) + req, _ := http.NewRequest(http.MethodGet, "/api/show/test-model", nil) + + resp := httptest.NewRecorder() + router.ServeHTTP(resp, req) + + var expected, actual map[string]any + err := json.Unmarshal([]byte(tc.resp), &expected) + if err != nil { + t.Fatalf("failed to unmarshal expected response: %v", err) + } + + err = json.Unmarshal(resp.Body.Bytes(), &actual) + if err != nil { + t.Fatalf("failed to unmarshal actual response: %v", err) + } + + if 
!reflect.DeepEqual(expected, actual) { + t.Errorf("responses did not match\nExpected: %+v\nActual: %+v", expected, actual) + } } } From 980dd15f81e9021c5165a1e516748d42cf134339 Mon Sep 17 00:00:00 2001 From: Josh <76125168+joshyan1@users.noreply.github.com> Date: Mon, 12 Aug 2024 11:46:09 -0700 Subject: [PATCH 31/71] cmd: speed up gguf creates (#6324) --- server/model.go | 17 +++++++-- server/model_test.go | 82 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 96 insertions(+), 3 deletions(-) diff --git a/server/model.go b/server/model.go index ad6e4e55..b17bf0e3 100644 --- a/server/model.go +++ b/server/model.go @@ -176,9 +176,20 @@ func parseFromFile(ctx context.Context, file *os.File, digest string, fn func(ap mediatype = "application/vnd.ollama.image.projector" } - layer, err := NewLayer(io.NewSectionReader(file, offset, n), mediatype) - if err != nil { - return nil, err + var layer Layer + if digest != "" && n == stat.Size() && offset == 0 { + layer, err = NewLayerFromLayer(digest, mediatype, file.Name()) + if err != nil { + slog.Debug("could not create new layer from layer", "error", err) + } + } + + // Fallback to creating layer from file copy (either NewLayerFromLayer failed, or digest empty/n != stat.Size()) + if layer.Digest == "" { + layer, err = NewLayer(io.NewSectionReader(file, offset, n), mediatype) + if err != nil { + return nil, err + } } layers = append(layers, &layerGGML{layer, ggml}) diff --git a/server/model_test.go b/server/model_test.go index aa214d3d..63fc408d 100644 --- a/server/model_test.go +++ b/server/model_test.go @@ -2,8 +2,10 @@ package server import ( "bytes" + "context" "encoding/json" "fmt" + "io" "os" "path/filepath" "testing" @@ -11,6 +13,7 @@ import ( "github.com/google/go-cmp/cmp" "github.com/ollama/ollama/api" + "github.com/ollama/ollama/llm" "github.com/ollama/ollama/template" ) @@ -133,3 +136,82 @@ The temperature in San Francisco, CA is 70°F and in Toronto, Canada is 20°C.`, }) } } + +func TestParseFromFileFromLayer(t *testing.T) { + tempModels := t.TempDir() + + file, err := os.CreateTemp(tempModels, "") + if err != nil { + t.Fatalf("failed to open file: %v", err) + } + defer file.Close() + if err := llm.WriteGGUF(file, llm.KV{"general.architecture": "gemma"}, []llm.Tensor{}); err != nil { + t.Fatalf("failed to write gguf: %v", err) + } + + if _, err := file.Seek(0, io.SeekStart); err != nil { + t.Fatalf("failed to seek to start: %v", err) + } + + layers, err := parseFromFile(context.Background(), file, "", func(api.ProgressResponse) {}) + if err != nil { + t.Fatalf("failed to parse from file: %v", err) + } + + if len(layers) != 1 { + t.Fatalf("got %d != want 1", len(layers)) + } + + if _, err := file.Seek(0, io.SeekStart); err != nil { + t.Fatalf("failed to seek to start: %v", err) + } + + layers2, err := parseFromFile(context.Background(), file, layers[0].Digest, func(api.ProgressResponse) {}) + if err != nil { + t.Fatalf("failed to parse from file: %v", err) + } + if len(layers2) != 1 { + t.Fatalf("got %d != want 1", len(layers2)) + } + + if layers[0].Digest != layers2[0].Digest { + t.Fatalf("got %s != want %s", layers[0].Digest, layers2[0].Digest) + } + + if layers[0].Size != layers2[0].Size { + t.Fatalf("got %d != want %d", layers[0].Size, layers2[0].Size) + } + + if layers[0].MediaType != layers2[0].MediaType { + t.Fatalf("got %v != want %v", layers[0].MediaType, layers2[0].MediaType) + } +} + +func TestParseLayerFromCopy(t *testing.T) { + tempModels := t.TempDir() + + file2, err := os.CreateTemp(tempModels, "") + if err != nil 
{ + t.Fatalf("failed to open file: %v", err) + } + defer file2.Close() + + for range 5 { + if err := llm.WriteGGUF(file2, llm.KV{"general.architecture": "gemma"}, []llm.Tensor{}); err != nil { + t.Fatalf("failed to write gguf: %v", err) + } + } + + if _, err := file2.Seek(0, io.SeekStart); err != nil { + t.Fatalf("failed to seek to start: %v", err) + } + + layers, err := parseFromFile(context.Background(), file2, "", func(api.ProgressResponse) {}) + if err != nil { + t.Fatalf("failed to parse from file: %v", err) + } + + if len(layers) != 5 { + t.Fatalf("got %d != want 5", len(layers)) + } +} From f7e3b9190f7e8f99bac8af432b9539e24cd3b57e Mon Sep 17 00:00:00 2001 From: Josh <76125168+joshyan1@users.noreply.github.com> Date: Mon, 12 Aug 2024 11:46:32 -0700 Subject: [PATCH 32/71] cmd: spinner progress for transfer model data (#6100) --- cmd/cmd.go | 45 ++++++++++++++++++++++++++++++++++++++++++--- progress/spinner.go | 14 ++++++++++---- 2 files changed, 52 insertions(+), 7 deletions(-) diff --git a/cmd/cmd.go b/cmd/cmd.go index d47db65b..2356110e 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -22,6 +22,7 @@ import ( "runtime" "slices" "strings" + "sync/atomic" "syscall" "time" @@ -78,6 +79,7 @@ func CreateHandler(cmd *cobra.Command, args []string) error { status := "transferring model data" spinner := progress.NewSpinner(status) p.Add(status, spinner) + defer p.Stop() for i := range modelfile.Commands { switch modelfile.Commands[i].Name { @@ -112,7 +114,7 @@ func CreateHandler(cmd *cobra.Command, args []string) error { path = tempfile } - digest, err := createBlob(cmd, client, path) + digest, err := createBlob(cmd, client, path, spinner) if err != nil { return err } @@ -263,13 +265,20 @@ func tempZipFiles(path string) (string, error) { return tempfile.Name(), nil } -func createBlob(cmd *cobra.Command, client *api.Client, path string) (string, error) { +func createBlob(cmd *cobra.Command, client *api.Client, path string, spinner *progress.Spinner) (string, error) { bin, err := os.Open(path) if err != nil { return "", err } defer bin.Close() + // Get file info to retrieve the size + fileInfo, err := bin.Stat() + if err != nil { + return "", err + } + fileSize := fileInfo.Size() + hash := sha256.New() if _, err := io.Copy(hash, bin); err != nil { return "", err @@ -279,13 +288,43 @@ func createBlob(cmd *cobra.Command, client *api.Client, path string) (string, er return "", err } + var pw progressWriter + status := "transferring model data 0%" + spinner.SetMessage(status) + + done := make(chan struct{}) + defer close(done) + + go func() { + ticker := time.NewTicker(60 * time.Millisecond) + defer ticker.Stop() + for { + select { + case <-ticker.C: + spinner.SetMessage(fmt.Sprintf("transferring model data %d%%", int(100*pw.n.Load()/fileSize))) + case <-done: + spinner.SetMessage("transferring model data 100%") + return + } + } + }() + digest := fmt.Sprintf("sha256:%x", hash.Sum(nil)) - if err = client.CreateBlob(cmd.Context(), digest, bin); err != nil { + if err = client.CreateBlob(cmd.Context(), digest, io.TeeReader(bin, &pw)); err != nil { return "", err } return digest, nil } +type progressWriter struct { + n atomic.Int64 +} + +func (w *progressWriter) Write(p []byte) (n int, err error) { + w.n.Add(int64(len(p))) + return len(p), nil +} + func RunHandler(cmd *cobra.Command, args []string) error { interactive := true diff --git a/progress/spinner.go b/progress/spinner.go index 02f3f9fb..e39a45ee 100644 --- a/progress/spinner.go +++ b/progress/spinner.go @@ -3,11 +3,12 @@ package progress import ( 
"fmt" "strings" + "sync/atomic" "time" ) type Spinner struct { - message string + message atomic.Value messageWidth int parts []string @@ -21,20 +22,25 @@ type Spinner struct { func NewSpinner(message string) *Spinner { s := &Spinner{ - message: message, parts: []string{ "⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏", }, started: time.Now(), } + s.SetMessage(message) go s.start() return s } +func (s *Spinner) SetMessage(message string) { + s.message.Store(message) +} + func (s *Spinner) String() string { var sb strings.Builder - if len(s.message) > 0 { - message := strings.TrimSpace(s.message) + + if message, ok := s.message.Load().(string); ok && len(message) > 0 { + message := strings.TrimSpace(message) if s.messageWidth > 0 && len(message) > s.messageWidth { message = message[:s.messageWidth] } From 6ffb5cb017a1c81970ac637907a8ba6fd151e0e7 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Mon, 3 Jun 2024 15:53:58 -0700 Subject: [PATCH 33/71] add conversion for microsoft phi 3 mini/medium 4k, 128 --- convert/convert.go | 6 + convert/convert_llama.go | 4 - convert/convert_phi3.go | 125 ++++++++++ convert/convert_test.go | 2 + .../testdata/Phi-3-mini-128k-instruct.json | 225 ++++++++++++++++++ llm/ggml.go | 8 + llm/gguf.go | 15 +- 7 files changed, 373 insertions(+), 12 deletions(-) create mode 100644 convert/convert_phi3.go create mode 100644 convert/testdata/Phi-3-mini-128k-instruct.json diff --git a/convert/convert.go b/convert/convert.go index b9461e4f..24c19aa4 100644 --- a/convert/convert.go +++ b/convert/convert.go @@ -27,6 +27,10 @@ func (Parameters) KV(t *Tokenizer) llm.KV { "tokenizer.ggml.token_type": t.Vocabulary.Types, } + if len(t.Merges) > 0 { + kv["tokenizer.ggml.merges"] = t.Merges + } + if t.Template != "" { kv["tokenizer.chat_template"] = t.Template } @@ -89,6 +93,8 @@ func Convert(fsys fs.FS, ws io.WriteSeeker) error { conv = &mixtral{} case "GemmaForCausalLM": conv = &gemma{} + case "Phi3ForCausalLM": + conv = &phi3{} default: return errors.New("unsupported architecture") } diff --git a/convert/convert_llama.go b/convert/convert_llama.go index 0383a85e..178b13f3 100644 --- a/convert/convert_llama.go +++ b/convert/convert_llama.go @@ -90,10 +90,6 @@ func (p *llama) KV(t *Tokenizer) llm.KV { kv["llama.attention.value_length"] = p.HeadDim } - if len(t.Merges) > 0 { - kv["tokenizer.ggml.merges"] = t.Merges - } - return kv } diff --git a/convert/convert_phi3.go b/convert/convert_phi3.go new file mode 100644 index 00000000..7aa3ed15 --- /dev/null +++ b/convert/convert_phi3.go @@ -0,0 +1,125 @@ +package convert + +import ( + "cmp" + "encoding/binary" + "io" + "math" + "strings" + "sync" + + "github.com/ollama/ollama/llm" +) + +type phi3 struct { + Parameters + NumHiddenLayers uint32 `json:"num_hidden_layers"` + NLayers uint32 `json:"n_layers"` + HiddenSize uint32 `json:"hidden_size"` + NEmbd uint32 `json:"n_embd"` + IntermediateSize uint32 `json:"intermediate_size"` + NumAttentionHeads uint32 `json:"num_attention_heads"` + NHead uint32 `json:"n_head"` + NumKeyValueHeads uint32 `json:"num_key_value_heads"` + NHeadKV uint32 `json:"n_head_kv"` + RopeTheta float32 `json:"rope_theta"` + RopeScaling struct { + Type string `json:"type"` + LongFactor ropeFactor `json:"long_factor"` + ShortFactor ropeFactor `json:"short_factor"` + } `json:"rope_scaling"` + RMSNormEPS float32 `json:"rms_norm_eps"` + NPositions uint32 `json:"n_positions"` + MaxPositionEmbeddings uint32 `json:"max_position_embeddings"` + OriginalMaxPositionEmbeddings uint32 `json:"original_max_position_embeddings"` + 
SlidingWindow uint32 `json:"sliding_window"` +} + +var _ Converter = (*phi3)(nil) + +func (p *phi3) KV(t *Tokenizer) llm.KV { + kv := p.Parameters.KV(t) + kv["general.architecture"] = "phi3" + kv["general.name"] = "phi3" + kv["phi3.context_length"] = p.MaxPositionEmbeddings + kv["phi3.embedding_length"] = cmp.Or(p.HiddenSize, p.NEmbd) + kv["phi3.feed_forward_length"] = p.IntermediateSize + kv["phi3.block_count"] = cmp.Or(p.NumHiddenLayers, p.NLayers) + kv["phi3.attention.head_count"] = cmp.Or(p.NumAttentionHeads, p.NHead) + kv["phi3.attention.head_count_kv"] = cmp.Or(p.NumKeyValueHeads, p.NHeadKV) + kv["phi3.attention.layer_norm_rms_epsilon"] = p.RMSNormEPS + kv["phi3.rope.dimension_count"] = p.HiddenSize / cmp.Or(p.NumAttentionHeads, p.NHead) + kv["phi3.rope.freq_base"] = p.RopeTheta + kv["phi3.rope.scaling.original_context_length"] = p.OriginalMaxPositionEmbeddings + kv["phi3.attention.sliding_window"] = p.SlidingWindow + + scale := float64(p.MaxPositionEmbeddings) / float64(p.OriginalMaxPositionEmbeddings) + + switch p.RopeScaling.Type { + case "": + // no scaling + case "su": + kv["phi3.rope.scaling.attn_factor"] = float32(max(math.Sqrt(1+math.Log(scale)/math.Log(float64(p.OriginalMaxPositionEmbeddings))), 1.0)) + case "yarn": + kv["phi3.rope.scaling.attn_factor"] = float32(max(0.1*math.Log(scale)+1.0, 1.0)) + default: + panic("unknown rope scaling type") + } + + return kv +} + +func (p *phi3) Tensors(ts []Tensor) []llm.Tensor { + var addRopeFactors sync.Once + + out := make([]llm.Tensor, 0, len(ts)+2) + for _, t := range ts { + name := p.tensorName(t.Name()) + if strings.HasPrefix(name, "blk.0.") { + addRopeFactors.Do(func() { + out = append(out, llm.Tensor{ + Name: "rope_factors_long.weight", + Kind: 0, + Shape: []uint64{uint64(len(p.RopeScaling.LongFactor))}, + WriterTo: p.RopeScaling.LongFactor, + }, llm.Tensor{ + Name: "rope_factors_short.weight", + Kind: 0, + Shape: []uint64{uint64(len(p.RopeScaling.ShortFactor))}, + WriterTo: p.RopeScaling.ShortFactor, + }) + }) + } + + out = append(out, llm.Tensor{ + Name: name, + Kind: t.Kind(), + Shape: t.Shape(), + WriterTo: t, + }) + } + + return out +} + +func (p *phi3) tensorName(n string) string { + return strings.NewReplacer( + "lm_head", "output", + "model.embed_tokens", "token_embd", + "model.norm", "output_norm", + "model.layers", "blk", + "input_layernorm", "attn_norm", + "self_attn.qkv_proj", "attn_qkv", + "self_attn.o_proj", "attn_output", + "mlp.down_proj", "ffn_down", + "mlp.gate_up_proj", "ffn_up", + "post_attention_layernorm", "ffn_norm", + ).Replace(n) +} + +type ropeFactor []float32 + +func (r ropeFactor) WriteTo(w io.Writer) (int64, error) { + err := binary.Write(w, binary.LittleEndian, r) + return 0, err +} diff --git a/convert/convert_test.go b/convert/convert_test.go index 88f38494..cb2c585e 100644 --- a/convert/convert_test.go +++ b/convert/convert_test.go @@ -65,6 +65,8 @@ func TestConvertFull(t *testing.T) { "Mistral-7B-Instruct-v0.2", "Mixtral-8x7B-Instruct-v0.1", "gemma-2b-it", + // microsoft/Phi-3-mini-128-instruct@d548c233192db00165d842bf8edff054bb3212f8 + "Phi-3-mini-128k-instruct", } for i := range cases { diff --git a/convert/testdata/Phi-3-mini-128k-instruct.json b/convert/testdata/Phi-3-mini-128k-instruct.json new file mode 100644 index 00000000..19296f5a --- /dev/null +++ b/convert/testdata/Phi-3-mini-128k-instruct.json @@ -0,0 +1,225 @@ +{ + "general.architecture": "phi3", + "general.file_type": "1", + "general.quantization_version": "2", + "phi3.block_count": "32", + "phi3.context_length": "131072", + 
"phi3.embedding_length": "3072", + "phi3.feed_forward_length": "8192", + "phi3.rope.scaling.original_context_length": "4096", + "phi3.rope.dimension_count": "96", + "phi3.rope.freq_base": "10000", + "phi3.rope.scaling.attn_factor": "1.1902381", + "phi3.attention.head_count": "32", + "phi3.attention.head_count_kv": "32", + "phi3.attention.layer_norm_rms_epsilon": "1e-05", + "phi3.attention.sliding_window": "262144", + "tokenizer.ggml.model": "llama", + "tokenizer.ggml.pre": "default", + "tokenizer.ggml.add_bos_token": "false", + "tokenizer.ggml.add_eos_token": "false", + "tokenizer.ggml.bos_token_id": "1", + "tokenizer.ggml.eos_token_id": "32000", + "tokenizer.ggml.unknown_token_id": "0", + "tokenizer.ggml.padding_token_id": "32000", + "tokenizer.ggml.scores": "6e37bcde2adc7e350e87c496eddd7a2124329c1dc66c5bf3ad3997253e4f7a62", + "tokenizer.ggml.token_type": "b6ecf55ec64ee67d87750bdb8d757a2c58bf78377e9f4219f5689a6c4dea57ce", + "tokenizer.ggml.tokens": "d168da3ddd3eee820916945fcb9baf24dd3cde42f606cffa2d19e7c8a8743918", + "blk.0.attn_norm.weight": "216aeb2c9e0c271f899e1ef2a63cceeb8f41e97642e84fada54b1d3c1c11cf25", + "blk.0.attn_output.weight": "b597d56f7188ffc1fafc273fadc59d41738cffd677ae98c61a62c3285b3a3099", + "blk.0.attn_qkv.weight": "d28a6b44e13f59be5483e4be2bedb544e346168d720aca27f47d1a5a722be91e", + "blk.0.ffn_down.weight": "4a691370e5a61fcbbf540fbcbf4c0f1d15dec0364528c0e916d0744f6262b63b", + "blk.0.ffn_norm.weight": "0c00af2b4a3128bec64a0cbb1084b042fdbe13d9ad0d03bd577f9449dfead338", + "blk.0.ffn_up.weight": "b32b52f790c1c083bfb8a3126dc1111cfeeb28dc8c584a930a1e5334cb176bf4", + "blk.1.attn_norm.weight": "68748011503c6c029e8e69a84a8e5a89338f378769627b6dbf7f93d715c292e1", + "blk.1.attn_output.weight": "2267344add13b048ca59e4377c86dc512be8046a57156901fa32a20fa74e4ee0", + "blk.1.attn_qkv.weight": "9109d2e3d7a2eacfda5226587b8be124a3bf44b972da7ebb17aa15795897eacc", + "blk.1.ffn_down.weight": "d675df4df4dd039c0c339ad6445d39eddd2004db6bf35bed6314c7497245a633", + "blk.1.ffn_norm.weight": "3b5767ae977bc8baaa06b06efdbea193b6b3ba605ce76d77a76ce317e935500c", + "blk.1.ffn_up.weight": "80dfd6d9d234b00334c89b8e0a02f81899c2efd377321c34ba5ba51a5f61b5ff", + "blk.2.attn_norm.weight": "6a6743b057e5088f145bc179e92c9bfb41163e7295d7b81c62e23dd89d2b59c4", + "blk.2.attn_output.weight": "bc5491ea54e0db81462d7d9b7d25cbdda380c2db8de041bd1c4ab7b76a1d19c3", + "blk.2.attn_qkv.weight": "a61287a9852e2f5aca9c100b471d98398b2913a3497c743de3c70ec9ddd7087f", + "blk.2.ffn_down.weight": "4fddcc382c8dceeab027fe43d8d44e67edb5e8ce4b9a1b7f773c87770380ade1", + "blk.2.ffn_norm.weight": "07e05f82b3f63f711db3b684ca79aed25c0657917e66f88af47348a82065c227", + "blk.2.ffn_up.weight": "4835a682ef1826c12df01ae7663fc45f9c82bc8e64b665f13fb7da8e201ec0fb", + "blk.3.attn_norm.weight": "f22aba7c03999ba7136f39cda747a39715e498699dc1716cd97fc5dfc58d1b1c", + "blk.3.attn_output.weight": "53b579855366fd786c5126b2b30aac4d583ca7bda56833c4865f5cadb5c18c6d", + "blk.3.attn_qkv.weight": "bb56aba78158123140fcea59c69ac562ca208f6d3086819417cdad8c50f333ad", + "blk.3.ffn_down.weight": "97280897a7cd86db2830c004bccc5bc094f50e293baded0189159a2019145a6e", + "blk.3.ffn_norm.weight": "10a8c99f8b57a960e8e0a1133c4a26f9148403d1b9bff2eff114917de996f3b5", + "blk.3.ffn_up.weight": "7324046c915e75d621b2043597a245a428d8eea31869135e6257a861491d8dcc", + "blk.4.attn_norm.weight": "507d8e164de94646edbfe33def8e8fbf7c9a6ee3fbaedb5000f72d9f51ec5e36", + "blk.4.attn_output.weight": "bbb3429e6efa98c150e0fdbf48c16180cbf0d0cbc1b3c253c6c319d78f4593a2", + "blk.4.attn_qkv.weight": 
"b95ee5be0786d3901273d806c339fe6c20e6bfffd2a20672a9f56af80921e8ab", + "blk.4.ffn_down.weight": "806bbf91df92a5a22bd5aa1ffb7fc2869f7293ffc7704771c290ecc583b27975", + "blk.4.ffn_norm.weight": "cfc2930a81df7aee3a5e7f726a15c1182233e868bf0d9d37f6b6ae6d8c15c234", + "blk.4.ffn_up.weight": "c3390c69533de2c8424e8069323ccc5d0c4543111535da04cf2c7d26745576aa", + "blk.5.attn_norm.weight": "0d71c4fbcefabbd021569442853d2fe90668b19409ae2805a718a829ca60beab", + "blk.5.attn_output.weight": "10ebd93629112bf2df5c30dd0953a4a5e9020306768283181ed426934d47e14f", + "blk.5.attn_qkv.weight": "5cb05633369f12d4b00e0ff787736bd846856682115720ebc6cce05270c334f6", + "blk.5.ffn_down.weight": "e28bcc5094212eafc7476dbc5b7a520d25b79578cbf4229d698e2655956a80ad", + "blk.5.ffn_norm.weight": "b6f2c4cf9f34bb4d59989f96165c14a67dc1e266ad0a6d0fcc49f1add929e6ff", + "blk.5.ffn_up.weight": "0f9ef99423cc07ebedc0e9cfa95809f2d7108d910bb4ef97ebc0b0309c440750", + "blk.6.attn_norm.weight": "b3edcc47a42218234f7564d7470611b49401a41ae8cd42123f86557c69f5d7f2", + "blk.6.attn_output.weight": "eb9b7d257b388bb5b8fe0515e5c6873317239cb94cda236e4b6ada2a6c57c65c", + "blk.6.attn_qkv.weight": "eb968081f478c52f07bd9c2761741e982dba33cc4eeadeea3557d391b9ac2106", + "blk.6.ffn_down.weight": "1b8588bb7463206290322695577dcfced300895d6e6f4b26966c53a9ae2f0f84", + "blk.6.ffn_norm.weight": "1219c04b7770983c77814200eefe743f46d15328ea2b12711e44f8103eab08d3", + "blk.6.ffn_up.weight": "197ef287239fec47c55677f0fbb66eaf0644f775bc382de843971730721394f6", + "blk.7.attn_norm.weight": "b630ad08c80d564ed1c024384818e9fd3f22a36cd7a14aa96e7e2759a8285099", + "blk.7.attn_output.weight": "970255aa750828a47d6b9d399f9612b5bf25aefe7dadbcba41fc416d0d4067c1", + "blk.7.attn_qkv.weight": "ebb157c880293e6de8d629f263ba8853ed1dbdc02c311d43432bb8cfbb310739", + "blk.7.ffn_down.weight": "24bcd4db4cba844c89f878b81843c373dbbc0675e889d32c5b12e63384a7b670", + "blk.7.ffn_norm.weight": "b9c6f71001808ee873ce7db8056e4b53fb4cccec8b7f0f312899b575fae39d39", + "blk.7.ffn_up.weight": "979f1828d227455c26015a2a11afe9dd05f2bb97a8ba6b38c8dab3f50e627401", + "blk.8.attn_norm.weight": "4e8e347e3775010b7112ee630f2f4f2383be7ff64e6ca6154b9b22566552eaa6", + "blk.8.attn_output.weight": "65a44babf44a435a1829945211b3168f9ec78ac3cb7a049a733e93d11f0d6659", + "blk.8.attn_qkv.weight": "343ed07671da400b040812a4058482fa38284b5d9af9becfed07417fe26ce747", + "blk.8.ffn_down.weight": "7fb7e073e3c2c503c4e9d60efa0988fed7398d900cc003695fe3fffd3e188b82", + "blk.8.ffn_norm.weight": "b07c1f655d8593e3892a2cf73f8a0c19ce8e5cb613fafbe7cbd430da8ce4c57d", + "blk.8.ffn_up.weight": "8b26e14de54b3fdc2e2d3ea41720f9d9c236a93688c3b7fd7bf43f5fbb327c9b", + "blk.9.attn_norm.weight": "46394d408a8e316916177e6aa261de32e137a82d729c0b1800b072f0c38c39b6", + "blk.9.attn_output.weight": "d57f3d46107947a7073373a0b35d6ecf7759b5df15406f4a3590a60666af6b16", + "blk.9.attn_qkv.weight": "14bb8ace8c5453148f4b536e9f4279c813f31136716947256f5cca333448639c", + "blk.9.ffn_down.weight": "2b8d98e2b5ed68338f6e4de43bf7de0c4858cc69103cd5177725f7444eec7694", + "blk.9.ffn_norm.weight": "41a499dfd418cc4c6b8c12313f673f7e2cd4a3f9c4065eb6c4feb5eed02fb542", + "blk.9.ffn_up.weight": "143aab7533a64b17fbe201490a6f674bc7f0bd370c094500b2e100419073d1c2", + "blk.10.attn_norm.weight": "ebb670aafd36816a794347287269d8f1a5b19c1e3c0a1e38023bc19fdba9b073", + "blk.10.attn_output.weight": "b5d65bbc0ed5e49fdd9d754bc18163cd042a285024d0cf6f954c503bc8c877cb", + "blk.10.attn_qkv.weight": "f06b15bac88da798fa34a62b03eaac0dbe8b846020516603c387541f2d8dd672", + "blk.10.ffn_down.weight": 
"fb091fcd1b4de25d1bea94d1755e255cb02914a030d23e3a234e57b8d46bde6e", + "blk.10.ffn_norm.weight": "eb347bdf9c40414af87e13a8e72e40b31f004b50f7cb366f1a219ced60a61355", + "blk.10.ffn_up.weight": "ed2d52fc881a173f404fe8a1067862c9856d6c3e0d2e90a330a7aa394e3f84d1", + "blk.11.attn_norm.weight": "64e252603cf010a0e502ca39fdf8d0a196a79aec67c0d2bb9213fc0cb80c47d4", + "blk.11.attn_output.weight": "228e33e21c69f52efc74fdfc831bc9af271e44b2a29a3dced1d64e667ce36eb5", + "blk.11.attn_qkv.weight": "ab9ce6d4ef9e42ee0da3f20a7708a3bbc5e79e967b05fa86ba946a05e2eb63eb", + "blk.11.ffn_down.weight": "0ca133b7835c98dc77c25d64e4eb7873778bdb5e4d22d8b80f920f46865b43bd", + "blk.11.ffn_norm.weight": "02455741a0dfd161c79aa1ecc381901721f229fdcda5615622a629631fb61cfd", + "blk.11.ffn_up.weight": "9fecdcc099fbb8e23c6b1ea9294702a027f4a58d265543ec5e7be79b8f63b354", + "blk.12.attn_norm.weight": "783bb459911b1b3609a9b2bdfe272f1670add73b5471da738e07ac47e2e07dfd", + "blk.12.attn_output.weight": "1e1a914c9e48b857206ac5a1f7cead994bc1ea91d5d4fff8c834d73f2e38ef5d", + "blk.12.attn_qkv.weight": "5953e7185ccb87fb4dae8f9426ec86315d4c7794326e8ab59b3a95d4af2189f0", + "blk.12.ffn_down.weight": "a3eecf0f394f86e2cfb48a5940a5c50ca86d71883b2f79fcc642a935fabce0d4", + "blk.12.ffn_norm.weight": "0a4272e41373c23bd72f10d2d82930aa3a1480aac75832bfbf01cebf0b86b6a4", + "blk.12.ffn_up.weight": "06f42776de3a7ceac3025f26a7a8bd20e062233cce2bdaa2183470dc4b30b87d", + "blk.13.attn_norm.weight": "5915da60fb03e201fa649faba780e5fdf1c761c262b206e5415cf83181f65780", + "blk.13.attn_output.weight": "4dbf6eab074fa3835fd32bd631a8208e511037d5056d2fd3015735cca7674ef7", + "blk.13.attn_qkv.weight": "d3d8339a1c4782d9e73d77fdebe154d3c5b83ac40c9175b3e91a4977d08f876b", + "blk.13.ffn_down.weight": "de6772b46a55e1fd42b007637dfbf68b6598e5d5b61622da0935002e1e192d3a", + "blk.13.ffn_norm.weight": "5a640ea3b8c7be49c95a58a2327e10d8e8d9d142504bde5c8091613e5b961d7a", + "blk.13.ffn_up.weight": "f35e3545e4bd3531b2e843b5efd31dee0c13c807ee6386e65473ba67bbec30d0", + "blk.14.attn_norm.weight": "9b34986450b7c98b4927e81e61a816f9e84b1addc7c14926402100037aad6678", + "blk.14.attn_output.weight": "155d52efb23d366016d861a251d4d1f4a0c13699188c50d50dba016a0d8bfcd9", + "blk.14.attn_qkv.weight": "8e1415084e1f33c73a777f19e752489f4dd312cca047733e5ea643cd4a955e04", + "blk.14.ffn_down.weight": "a2a142226b94baa01ccb65bdea2b7418e49085c1d9c3c63e544e3112c58a25da", + "blk.14.ffn_norm.weight": "8aecfd9b0ae6affaea31a80c5c9a4a14b31deaa0db7bd8f6da2a64d23447921c", + "blk.14.ffn_up.weight": "0c1407237b8c1bd02f193346b5681926fe698a5055eac6a7450451b0f991707c", + "blk.15.attn_norm.weight": "e037bd19880bfa83d983200fb0c7866f8ad16c3ff5cc4b4f3a37ca7373870ff6", + "blk.15.attn_output.weight": "045fe4fc95cc129a1b92771b179c11b12845c4c088786c607f17bd98857e68e1", + "blk.15.attn_qkv.weight": "7621b7559705cab1d4dea1c69f76dbf9dc1c8837a203b656f484703b9c1b70ce", + "blk.15.ffn_down.weight": "7e5ac20e290bc60761e1cd972354fde225b7fa861048d44d9a0dd9b046d55f58", + "blk.15.ffn_norm.weight": "b6d830d88f1db1825687973c8c2b1a24c6fa84f07af8d0e3ef9c86009baca0b2", + "blk.15.ffn_up.weight": "dcda0957cd04fc45476774dba2bbf9aa89d6b05d5ca7b10ae6f73ad2c49b1cd3", + "blk.16.attn_norm.weight": "4ee9b70ba15cb2a08240f93990e90f5068c48fceb481f8e2186bec8b7214eb3f", + "blk.16.attn_output.weight": "315cfe5536658d2498192b2980eade15b2c9a4ff220e4011911457b1727fa103", + "blk.16.attn_qkv.weight": "3c8122e3ad637583b9dcde8ff3a323267d3014bb1f0f9771e5322260ca9ecc8d", + "blk.16.ffn_down.weight": "3b5fbebd5ee2b86cad96fb8a9b45a8770d08f82c1c8b74d7061e866f7020a18d", + 
"blk.16.ffn_norm.weight": "ffab69f20bda372de6e5878f0539163e2fc6ba113621ded95705fc3b1465c9f0", + "blk.16.ffn_up.weight": "0935ea3d258da42d6258406365f39f58ddaabfe97ea5977580db3635188f24a1", + "blk.17.attn_norm.weight": "f030441733f3d147b4a06a1eb4aeb8465c7c24d9c53bf4c48fe7e134d3629803", + "blk.17.attn_output.weight": "07a955ef09e8dc766ac0df647d0b2c69f23c4c69a7137654b4aad80303ed0eda", + "blk.17.attn_qkv.weight": "1c10688061e21e2fe12ad0cb54bf03895c1f83c3b0df743a42f548b52cbca1b2", + "blk.17.ffn_down.weight": "ebb9cc9836f41d88fdae2aa9a4355514e4edaec8d1577ffeb947a35204e77f52", + "blk.17.ffn_norm.weight": "50aff44f6528b13db5389f2ddcdb7676244947610bd7ffbff3f881c968c2a0d4", + "blk.17.ffn_up.weight": "d716537949582be33bde6b02e38f5a70081c9642a9fb05a61312126718b8d148", + "blk.18.attn_norm.weight": "0ea695c4e53d637902f46663a6ee42adc493c36794476acc7dbddaa05b13840d", + "blk.18.attn_output.weight": "5fd35b500221a612eb4f4bddf0e9b6b7db4d7733032a75f8802fb2d884647c2e", + "blk.18.attn_qkv.weight": "b0da37fd030fe69581f990bf23bfd35467a1bbe558af6de7c0924f6b72e92317", + "blk.18.ffn_down.weight": "b355c33f44b328f4bb977567de8f7544db4b005d7a8fbded658518ecf3c5a153", + "blk.18.ffn_norm.weight": "58b3fe9094079989a86e0387143259e1cc35952d24dc3df290c4ba6df44f5c51", + "blk.18.ffn_up.weight": "2ce530954c342c30ed2ead5353f931960bfae1d278868504c0efb973560fabbe", + "blk.19.attn_norm.weight": "533e9aed66feea8f0392aa81f9e293240e1f009a5334253915fb60c2749b615d", + "blk.19.attn_output.weight": "84f2d00f98a4113a779d3b5d1c3e7c914eb47784d3ab13b290367c124c2994aa", + "blk.19.attn_qkv.weight": "fbe6b9f53b07fa7537d3b3d452d20a9bc666f9fd41ec2091dd28bc2f70fc668f", + "blk.19.ffn_down.weight": "b30199e098c8bb3f890183d8b18471e80b62b604729b277ad62488dd71e1206b", + "blk.19.ffn_norm.weight": "c81373e41cd340b7badb19f9517c77c4250b4eb9a02dc758b8b49b652487d7ff", + "blk.19.ffn_up.weight": "5a5cb083ca7725720e3a890f7fa46354760e8007a8188849a092e305694a75e3", + "blk.20.attn_norm.weight": "4953091b4477e354357a8e743ba0a1900633e52f1599ee082a0c9b0b2b5cd978", + "blk.20.attn_output.weight": "62d54f7749cd6856097b2632066a322b0296df915fe66f382c5b5981be0d4f23", + "blk.20.attn_qkv.weight": "406de9e35b0729ebe902d7a47905cc7fb29a921431ed35dbef0c03e5690a1329", + "blk.20.ffn_down.weight": "62fb678b0d1261e19a4903a2b347d67afcc8acff01feb33a687a35a2d1e6f9a5", + "blk.20.ffn_norm.weight": "cd9d36b7e71e55c8925b97bb09c28219f182626bcff094878ae39c3db887a14b", + "blk.20.ffn_up.weight": "b9276771d79d3e932e73ccc520c3f8476342b9ef312ed2ee1e0da822e6e3ad18", + "blk.21.attn_norm.weight": "66d8c8a35e13ce9c2a0e75b670150e2c31484a55c2316df46075312196178ed3", + "blk.21.attn_output.weight": "12ab46c9382648f9b3350fdd92a6be6352743d62d6b520d7e2024e0c838588f5", + "blk.21.attn_qkv.weight": "a7909676ee1675ca23cd29a5fdd226df8dd9d68f94c6c9bbb51dd9fd38504008", + "blk.21.ffn_down.weight": "6fb317279c6542e82f97d5a12a60fac1bd0fa0405154f9fbe265e2fe39bd49cc", + "blk.21.ffn_norm.weight": "c0f703eb3ff161b5ba4490d87d8684b8a6c47a8f433e12f418333b9db439010a", + "blk.21.ffn_up.weight": "6dbdb80ef0c35e364bbce12d40d5e74c7963c7b55d58d9579567a07ffce7b863", + "blk.22.attn_norm.weight": "f94237433bf03d675cb2f655b81ca91a1ce2447bc6b00b13d6b0ccfe2d411eff", + "blk.22.attn_output.weight": "e821f95995ce497c01e63ca64f737713b1b65f11df1903e51d444aa516f33f71", + "blk.22.attn_qkv.weight": "1b0f717c73afb5eb4c82a1708c4e85c969e8a2a8770d9ddb78b1870a2d8a781e", + "blk.22.ffn_down.weight": "0f33f7a3cdc685484be99aa0c03642b0b20850a27d1fddbe054b13a9382f3ccb", + "blk.22.ffn_norm.weight": 
"9df285cf211ddd7df2b36a50489af574755c7d4d98b29a05cd04566ae613c8dc", + "blk.22.ffn_up.weight": "63ac300e1efb34041dd0136cf43ea622fac6f0caccce1cd9262f5e08d2cf179c", + "blk.23.attn_norm.weight": "5f72d9e88689b4027b28f5f8f26cd3abb03635ceea7ec98a4c91a9fc691f6707", + "blk.23.attn_output.weight": "6ecf04ff61125c5fc768f8656497152149373daf321ee9c957e8f7245a1184d1", + "blk.23.attn_qkv.weight": "a9d9978806724c2959f2cf386c233831f08e1e933dbf2b32665e788d9d512ea4", + "blk.23.ffn_down.weight": "72c7d17886a3da17fa0daa456aa5e877b2ef5b8b403182b870d9ca5ca9c70347", + "blk.23.ffn_norm.weight": "971e4b712e3025a13419b5b57d674b5e4ab7f18f74b57b9afc4671623da90c4b", + "blk.23.ffn_up.weight": "df2b5c7dbd5834545b815073af0c7355b065124e6d6f0fee78d8fa5b2076dc3e", + "blk.24.attn_norm.weight": "c41957c4a79ad3b16f6e11daec1c7f530b9f3f4b618e1e4367c3b67787ac4ab6", + "blk.24.attn_output.weight": "ef7d61f5fc88ac6f31bf60cb5f4d2d6b8df42d38825807112361a7224b0dee3b", + "blk.24.attn_qkv.weight": "3e6a58fe7d49c90bb6971efbad3371c32256881173ea5aee4b0c296cb206490f", + "blk.24.ffn_down.weight": "f43619144047de42fed81dfa495f1815d3cb771330e574043e2b67620819292c", + "blk.24.ffn_norm.weight": "5501d4a2a98c8ca6b42e77b53b221dbc08f530f6a067256d787534ec6fe028bd", + "blk.24.ffn_up.weight": "d64c8b0e509e2b1118f6000176f8956cacecdbb200c7e95ed93fb78b6e26c84a", + "blk.25.attn_norm.weight": "502fa3c302d371f61c5791f4615b73018ffb1daa09b6499b227116581244c5d4", + "blk.25.attn_output.weight": "ad8391d4e9c980856f2547aa945b2b6a407a6382158dc1ddd4f08d94ecc24be6", + "blk.25.attn_qkv.weight": "42e8983780d4a01a02c54ad23d4df21eea437f119a10af5a9c12a76a42d308c1", + "blk.25.ffn_down.weight": "302dd010d4e0ab4eeaee89090409ea0dddeeeed3236415eb8f97c942497eea91", + "blk.25.ffn_norm.weight": "fb34c1ee5bca96986c08834df0a0c047ba041c1123ac1f563e9d64312bf82d6a", + "blk.25.ffn_up.weight": "10739a8de156816d93c92b935386540bfa976bdbef204f0312960f6fc657582f", + "blk.26.attn_norm.weight": "7036c711609128c4e55968ff3681d3043338879a5737efd6c2ac9e1a2a61f1a0", + "blk.26.attn_output.weight": "db5db45dead5cb911fa01da59832f121b7c18b2d167bf53741c40819f24d346c", + "blk.26.attn_qkv.weight": "cae34c6b7f82ed14348d5ed30a79919c383737c1694a9cb9c0de609d3b0c1d0a", + "blk.26.ffn_down.weight": "491ec3a4da9b4f49f8ebc6be658ce397a9b801ae9fb35e82177e47808c65e5d0", + "blk.26.ffn_norm.weight": "fd7059d75d7f0e5288511ddeeb0f772eb3cae3ccfe4226b877015834edc3c386", + "blk.26.ffn_up.weight": "ea1ee1274c56458ce056d2205e5bb6e5422ce4cb0ad58006b8141749b97a0c39", + "blk.27.attn_norm.weight": "cc362c9a937609265052cd38544af17a1a7448cea086d4c801139e1fc865832d", + "blk.27.attn_output.weight": "ba757a81dabde9cb1b069d1bb616fe79649a1724f756567ec61caed1304fe6cf", + "blk.27.attn_qkv.weight": "1ab8d7d02d87756c12c2275636823aa5ede3d683178225c4cac4bd892c319bd4", + "blk.27.ffn_down.weight": "deb1c711c8a66acf4dcd2d088e1548f8e08f296f755e4067d6557fa55afde88c", + "blk.27.ffn_norm.weight": "fc6242d8cb8a4a37a8ddb7e41e7e60a63d4a89edf36acb35df052f10b9c91ece", + "blk.27.ffn_up.weight": "8df39b09c4801f343aca78f2918a1f6db78c8c55e591eda4c69eadb74c26e180", + "blk.28.attn_norm.weight": "75b539308f77e3cefdc6d98484d8b5cbf0538f0c2869a77b7373a145a18bc850", + "blk.28.attn_output.weight": "ae128940eb60a6d2e121762ef4b3e9dcf9eb3e105b249507fa7f12de0e19822c", + "blk.28.attn_qkv.weight": "bdda781c288e9326c240e33905f8e621b6a2ad902e620739d34f93fcd6f933de", + "blk.28.ffn_down.weight": "f1d6e6d1c286b1138bfd7e53fe477f399ae93bc2c04e35416f84218ed7247965", + "blk.28.ffn_norm.weight": "3f837ce82c8b9bde0d61d08b6f5fe5574886ea5328dbdc53f2929f18da8b4087", + 
"blk.28.ffn_up.weight": "2af027002e31d1b6cfedbdb30a2b9d7213f3aa691167c353913adfd48fda31e4", + "blk.29.attn_norm.weight": "61e8003b5329462ffe0fe172f2b160260de006aed858332d49d75504b6b6aa7a", + "blk.29.attn_output.weight": "ca44542a72a37476dc73dbdcc01f5b7497cb3ebc4ea230a55c9634ccd8e56ad4", + "blk.29.attn_qkv.weight": "abb3d9d6abe57872ae3daa51935d43264093ded5ce63b49d1e280ee5758be0e4", + "blk.29.ffn_down.weight": "6764b895fce881df097489c263446f0106de36217997660c15984b3ee22a5a06", + "blk.29.ffn_norm.weight": "89e03e9a33fc0e6e31ba9f0c2bd7c5734a118c5602bb90148793e08a80e8d0ae", + "blk.29.ffn_up.weight": "fa7ad57a84954f4121653152efed1a871d8adb20a1ea9086e3e849ce359d7d2e", + "blk.30.attn_norm.weight": "91a697aca1e42af54f806a20211031c3369e8d0bd58df1b0147fe24954e1f5a4", + "blk.30.attn_output.weight": "36063fcf766c89ac75be56f688cc63cefe5f2c733fbf4378ea9956ad386fa148", + "blk.30.attn_qkv.weight": "2cacd1161f1121a2c0b979930134f4666f73fb8d7237b3b0659ae091b15955a6", + "blk.30.ffn_down.weight": "9f3fcb6217100595850c05dc98f9ab2a263afdb6ab28df2fcb08aeff512057d7", + "blk.30.ffn_norm.weight": "6c600bc1fc7de39d4f8917b81fc7d1d5ed2a9b56492234c13a4bd6028c30d880", + "blk.30.ffn_up.weight": "73cabd1bb011956b2689ea3338bb76642ef3a57c197377d666d2ab5f56317668", + "blk.31.attn_norm.weight": "72d3e1cc771380645fa75a899858c95f39857a4f3f1ed60fe1578df383b8bc53", + "blk.31.attn_output.weight": "40089cdd29994dc19a1d89fa15902a89cfeca3540f12dc9bf4d00ef82506e456", + "blk.31.attn_qkv.weight": "1d0bb40e9258071ae14290a53c619a8e331dda07354d2a02ef45766c029ae5e4", + "blk.31.ffn_down.weight": "8defa0e06335b793fa8be03883f0a322d6c5b33f52c69c943c35c60d16e42c0a", + "blk.31.ffn_norm.weight": "33c55d9d0c496ccfb130361fe131649346e098abaaac39c0519507e5d846721d", + "blk.31.ffn_up.weight": "599f6503f61c692c1f82001973d35119f9688db5e6be9d9c298411491c93f09b", + "output.weight": "14b8dc662bfa3308ebb2e102c562d8e52c15670e538f20f3216a9c310ca9dd41", + "output_norm.weight": "7f2294ba94ce65681df6c7ddd8698799199b9d77dc83c10bdad5c3999f0fdb82", + "rope_factors_long.weight": "e34d378664e354652c38f47d10dafb0498ccc2fb042d39ff7fef768146fff22b", + "rope_factors_short.weight": "9379146a4988f373d362fe47b06c75e7fe7c54aa4dc9558758df79b7a87471fd", + "token_embd.weight": "19a03c1fb5ac0baee93b0a7d8b0f26e9a9b011e229b694afc50ebfc13d84f8bf" +} diff --git a/llm/ggml.go b/llm/ggml.go index d7f2eef7..4c68adf9 100644 --- a/llm/ggml.go +++ b/llm/ggml.go @@ -157,6 +157,14 @@ type Tensor struct { io.WriterTo `json:"-"` } +func (t Tensor) block() (n int) { + if _, err := fmt.Sscanf(t.Name, "blk.%d.", &n); err != nil { + return -1 + } + + return +} + func (t Tensor) blockSize() uint64 { switch t.Kind { case 0, 1, 24, 25, 26, 27, 28, 30: // F32, F16, I8, I16, I32, I64, F64, BF16 diff --git a/llm/gguf.go b/llm/gguf.go index 98158313..2e6bc542 100644 --- a/llm/gguf.go +++ b/llm/gguf.go @@ -532,15 +532,14 @@ func WriteGGUF(ws io.WriteSeeker, kv KV, ts []Tensor) error { } } - slices.SortFunc(ts, func(a, b Tensor) int { - var i, j int - if n, err := fmt.Sscanf(a.Name, "blk.%d", &i); err != nil || n != 1 { - return cmp.Compare(a.Name, b.Name) - } else if n, err := fmt.Sscanf(b.Name, "blk.%d", &j); err != nil || n != 1 { - return cmp.Compare(a.Name, b.Name) + slices.SortStableFunc(ts, func(a, b Tensor) int { + if i, j := a.block(), b.block(); i < 0 && j > 0 { + return 1 + } else if i > 0 && j < 0 { + return -1 + } else { + return cmp.Compare(i, j) } - - return cmp.Compare(i, j) }) var s uint64 From aec77d6a05c3cd13732eab7decc9794bbed670d9 Mon Sep 17 00:00:00 2001 From: Bruce MacDonald Date: Tue, 
2 Jul 2024 14:40:01 -0700 Subject: [PATCH 34/71] support new "longrope" attention factor --- convert/convert_phi3.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert/convert_phi3.go b/convert/convert_phi3.go index 7aa3ed15..0f645217 100644 --- a/convert/convert_phi3.go +++ b/convert/convert_phi3.go @@ -58,7 +58,7 @@ func (p *phi3) KV(t *Tokenizer) llm.KV { switch p.RopeScaling.Type { case "": // no scaling - case "su": + case "su", "longrope": kv["phi3.rope.scaling.attn_factor"] = float32(max(math.Sqrt(1+math.Log(scale)/math.Log(float64(p.OriginalMaxPositionEmbeddings))), 1.0)) case "yarn": kv["phi3.rope.scaling.attn_factor"] = float32(max(0.1*math.Log(scale)+1.0, 1.0)) From bd5e432630a0c1d1ca5795052355a45014e71a2a Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Mon, 5 Aug 2024 10:30:32 -0700 Subject: [PATCH 35/71] update import.md --- docs/import.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/import.md b/docs/import.md index f34f09ac..82ea9ba5 100644 --- a/docs/import.md +++ b/docs/import.md @@ -16,7 +16,9 @@ If the model being imported is one of these architectures, it can be imported di - LlamaForCausalLM - MistralForCausalLM + - MixtralForCausalLM - GemmaForCausalLM + - Phi3ForCausalLM ```dockerfile FROM /path/to/safetensors/directory From 8b00a415ab5170a5a75b105402ca262d1fb7ac12 Mon Sep 17 00:00:00 2001 From: royjhan <65097070+royjhan@users.noreply.github.com> Date: Tue, 13 Aug 2024 13:19:56 -0400 Subject: [PATCH 36/71] Load Embedding Model on Empty Input (#6325) * load on empty input * no load on invalid input --- server/routes.go | 16 +++++----- server/routes_test.go | 70 ------------------------------------------- 2 files changed, 9 insertions(+), 77 deletions(-) diff --git a/server/routes.go b/server/routes.go index e5a31002..6c470c17 100644 --- a/server/routes.go +++ b/server/routes.go @@ -324,13 +324,10 @@ func (s *Server) EmbedHandler(c *gin.Context) { input = append(input, v.(string)) } default: - c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "invalid input type"}) - return - } - - if len(input) == 0 { - c.JSON(http.StatusOK, api.EmbedResponse{Model: req.Model, Embeddings: [][]float32{}}) - return + if req.Input != nil { + c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "invalid input type"}) + return + } } r, m, opts, err := s.scheduleRunner(c.Request.Context(), req.Model, []Capability{}, req.Options, req.KeepAlive) @@ -341,6 +338,11 @@ func (s *Server) EmbedHandler(c *gin.Context) { checkpointLoaded := time.Now() + if len(input) == 0 { + c.JSON(http.StatusOK, api.EmbedResponse{Model: req.Model, Embeddings: [][]float32{}}) + return + } + kvData, err := getKVData(m.ModelPath, false) if err != nil { c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) diff --git a/server/routes_test.go b/server/routes_test.go index ef7248ef..242875d6 100644 --- a/server/routes_test.go +++ b/server/routes_test.go @@ -272,76 +272,6 @@ func Test_Routes(t *testing.T) { assert.Equal(t, "library", retrieveResp.OwnedBy) }, }, - { - Name: "Embed Handler Empty Input", - Method: http.MethodPost, - Path: "/api/embed", - Setup: func(t *testing.T, req *http.Request) { - embedReq := api.EmbedRequest{ - Model: "t-bone", - Input: "", - } - jsonData, err := json.Marshal(embedReq) - require.NoError(t, err) - req.Body = io.NopCloser(bytes.NewReader(jsonData)) - }, - Expected: func(t *testing.T, resp *http.Response) { - contentType := resp.Header.Get("Content-Type") - if contentType != "application/json; charset=utf-8" { - 
t.Fatalf("expected content type application/json; charset=utf-8, got %s", contentType) - } - body, err := io.ReadAll(resp.Body) - if err != nil { - t.Fatal(err) - } - - var embedResp api.EmbedResponse - err = json.Unmarshal(body, &embedResp) - if err != nil { - t.Fatal(err) - } - - if embedResp.Model != "t-bone" { - t.Fatalf("expected model t-bone, got %s", embedResp.Model) - } - - if embedResp.Embeddings == nil { - t.Fatalf("expected embeddings to not be nil, got %v", embedResp.Embeddings) - } - - if len(embedResp.Embeddings) != 0 { - t.Fatalf("expected embeddings to be empty, got %v", embedResp.Embeddings) - } - }, - }, - { - Name: "Embed Handler Invalid Input", - Method: http.MethodPost, - Path: "/api/embed", - Setup: func(t *testing.T, req *http.Request) { - embedReq := api.EmbedRequest{ - Model: "t-bone", - Input: 2, - } - jsonData, err := json.Marshal(embedReq) - require.NoError(t, err) - req.Body = io.NopCloser(bytes.NewReader(jsonData)) - }, - Expected: func(t *testing.T, resp *http.Response) { - contentType := resp.Header.Get("Content-Type") - if contentType != "application/json; charset=utf-8" { - t.Fatalf("expected content type application/json; charset=utf-8, got %s", contentType) - } - _, err := io.ReadAll(resp.Body) - if err != nil { - t.Fatal(err) - } - - if resp.StatusCode != http.StatusBadRequest { - t.Fatalf("expected status code 400, got %d", resp.StatusCode) - } - }, - }, } t.Setenv("OLLAMA_MODELS", t.TempDir()) From feedf49c717a449cedbf973b06ca97796cfaa004 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Tue, 13 Aug 2024 11:44:50 -0700 Subject: [PATCH 37/71] Go back to a pinned Go version Go version 1.22.6 is triggering AV false positives, so go back to 1.22.5 --- .github/workflows/release.yaml | 10 +++++----- .github/workflows/test.yaml | 10 +++++----- go.mod | 2 +- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index f0c6db5d..5ae630c3 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -31,7 +31,7 @@ jobs: security set-keychain-settings -lut 3600 build.keychain - uses: actions/setup-go@v5 with: - go-version: "stable" + go-version-file: go.mod cache: true - name: Build Darwin env: @@ -87,7 +87,7 @@ jobs: write-host "plugin installed" - uses: actions/setup-go@v5 with: - go-version: "stable" + go-version-file: go.mod cache: true - run: go get ./... - run: | @@ -141,7 +141,7 @@ jobs: write-host "plugin installed" - uses: actions/setup-go@v5 with: - go-version: "stable" + go-version-file: go.mod cache: true - name: 'Install ROCm' run: | @@ -218,7 +218,7 @@ jobs: write-host "plugin installed" - uses: actions/setup-go@v5 with: - go-version: "stable" + go-version-file: go.mod cache: true - name: 'Install CUDA' run: | @@ -306,7 +306,7 @@ jobs: write-host "plugin installed" - uses: actions/setup-go@v5 with: - go-version: "stable" + go-version-file: go.mod cache: true - run: go get - uses: actions/download-artifact@v4 diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index a57d45fd..3d58fa3e 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -63,7 +63,7 @@ jobs: - uses: actions/checkout@v4 - uses: actions/setup-go@v5 with: - go-version: "stable" + go-version-file: go.mod cache: true - run: go get ./... 
- run: | @@ -163,7 +163,7 @@ jobs: - uses: actions/checkout@v4 - uses: actions/setup-go@v5 with: - go-version: "stable" + go-version-file: go.mod cache: true - name: 'Install ROCm' run: | @@ -200,7 +200,7 @@ jobs: - uses: actions/checkout@v4 - uses: actions/setup-go@v5 with: - go-version: "stable" + go-version-file: go.mod cache: true - name: 'Install CUDA' run: | @@ -255,7 +255,7 @@ jobs: submodules: recursive - uses: actions/setup-go@v5 with: - go-version: "stable" + go-version-file: go.mod cache: false - run: | case ${{ matrix.arch }} in @@ -297,7 +297,7 @@ jobs: submodules: recursive - uses: actions/setup-go@v5 with: - go-version: "stable" + go-version-file: go.mod cache: true - run: | case ${{ matrix.arch }} in diff --git a/go.mod b/go.mod index 2e0c6614..6e437c73 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module github.com/ollama/ollama -go 1.22.0 +go 1.22.5 require ( github.com/containerd/console v1.0.3 From 1f32276178d5860bbaeafb7dd73d4ef93053bc15 Mon Sep 17 00:00:00 2001 From: Pamela Fox Date: Tue, 13 Aug 2024 13:36:05 -0700 Subject: [PATCH 38/71] Update openai.md to remove extra checkbox (#6345) --- docs/openai.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/openai.md b/docs/openai.md index 7b3a3f31..75d2c595 100644 --- a/docs/openai.md +++ b/docs/openai.md @@ -182,7 +182,6 @@ curl http://localhost:11434/v1/embeddings \ - [x] Reproducible outputs - [x] Vision - [x] Tools (streaming support coming soon) -- [ ] Vision - [ ] Logprobs #### Supported request fields From 2697d7f5aad27248aebbe5acff1dcbede5367b7b Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Tue, 13 Aug 2024 13:40:37 -0700 Subject: [PATCH 39/71] lint - fixes printf: non-constant format string in call to fmt.Printf - fixes SA1032: arguments have the wrong order - disables testifylint --- .golangci.yaml | 1 - readline/buffer.go | 79 ++++++++++++++++---------------------------- readline/readline.go | 2 +- readline/types.go | 61 ++++++++++++++++++++++------------ server/sched.go | 2 +- 5 files changed, 70 insertions(+), 75 deletions(-) diff --git a/.golangci.yaml b/.golangci.yaml index c2c8b52b..c9c9f620 100644 --- a/.golangci.yaml +++ b/.golangci.yaml @@ -24,7 +24,6 @@ linters: - nosprintfhostport - staticcheck - tenv - - testifylint - unconvert - unused - usestdlibvars diff --git a/readline/buffer.go b/readline/buffer.go index 68573d40..d91fe0a9 100644 --- a/readline/buffer.go +++ b/readline/buffer.go @@ -62,7 +62,7 @@ func (b *Buffer) MoveLeft() { rLength := runewidth.RuneWidth(r) if b.DisplayPos%b.LineWidth == 0 { - fmt.Printf(CursorUp + CursorBOL + cursorRightN(b.Width)) + fmt.Print(CursorUp + CursorBOL + CursorRightN(b.Width)) if rLength == 2 { fmt.Print(CursorLeft) } @@ -74,7 +74,7 @@ func (b *Buffer) MoveLeft() { fmt.Print(CursorLeft) } } else { - fmt.Print(cursorLeftN(rLength)) + fmt.Print(CursorLeftN(rLength)) } b.Pos -= 1 @@ -115,15 +115,15 @@ func (b *Buffer) MoveRight() { b.DisplayPos += rLength if b.DisplayPos%b.LineWidth == 0 { - fmt.Printf(CursorDown + CursorBOL + cursorRightN(len(b.Prompt.prompt()))) + fmt.Print(CursorDown + CursorBOL + CursorRightN(len(b.Prompt.prompt()))) } else if (b.DisplayPos-rLength)%b.LineWidth == b.LineWidth-1 && hasSpace { - fmt.Printf(CursorDown + CursorBOL + cursorRightN(len(b.Prompt.prompt())+rLength)) + fmt.Print(CursorDown + CursorBOL + CursorRightN(len(b.Prompt.prompt())+rLength)) b.DisplayPos += 1 } else if b.LineHasSpace.Size() > 0 && b.DisplayPos%b.LineWidth == b.LineWidth-1 && hasSpace { - fmt.Printf(CursorDown + CursorBOL + 
cursorRightN(len(b.Prompt.prompt()))) + fmt.Print(CursorDown + CursorBOL + CursorRightN(len(b.Prompt.prompt()))) b.DisplayPos += 1 } else { - fmt.Print(cursorRightN(rLength)) + fmt.Print(CursorRightN(rLength)) } } } @@ -154,7 +154,7 @@ func (b *Buffer) MoveToStart() { fmt.Print(CursorUp) } } - fmt.Printf(CursorBOL + cursorRightN(len(b.Prompt.prompt()))) + fmt.Print(CursorBOL + CursorRightN(len(b.Prompt.prompt()))) b.Pos = 0 b.DisplayPos = 0 } @@ -169,9 +169,9 @@ func (b *Buffer) MoveToEnd() { fmt.Print(CursorDown) } remainder := b.DisplaySize() % b.LineWidth - fmt.Printf(CursorBOL + cursorRightN(len(b.Prompt.prompt())+remainder)) + fmt.Print(CursorBOL + CursorRightN(len(b.Prompt.prompt())+remainder)) } else { - fmt.Print(cursorRightN(b.DisplaySize() - b.DisplayPos)) + fmt.Print(CursorRightN(b.DisplaySize() - b.DisplayPos)) } b.Pos = b.Buf.Size() @@ -286,8 +286,7 @@ func (b *Buffer) drawRemaining() { remLength := runewidth.StringWidth(remainingText) if len(currLine) > 0 { - fmt.Printf(ClearToEOL + currLine) - fmt.Print(cursorLeftN(currLineSpace)) + fmt.Print(ClearToEOL + currLine + CursorLeftN(currLineSpace)) } else { fmt.Print(ClearToEOL) } @@ -301,9 +300,9 @@ func (b *Buffer) drawRemaining() { } if (b.DisplayPos+currLineSpace)%b.LineWidth == 0 && currLine == remainingText { - fmt.Print(cursorRightN(currLineSpace)) + fmt.Print(CursorRightN(currLineSpace)) fmt.Printf("\n%s", b.Prompt.AltPrompt) - fmt.Printf(CursorUp + CursorBOL + cursorRightN(b.Width-currLineSpace)) + fmt.Print(CursorUp + CursorBOL + CursorRightN(b.Width-currLineSpace)) } // render the other lines @@ -333,9 +332,7 @@ func (b *Buffer) drawRemaining() { lineLength += runewidth.RuneWidth(c) fmt.Printf("%c", c) } - fmt.Print(ClearToEOL) - fmt.Print(cursorUpN(totalLines)) - fmt.Printf(CursorBOL + cursorRightN(b.Width-currLineSpace)) + fmt.Print(ClearToEOL + CursorUpN(totalLines) + CursorBOL + CursorRightN(b.Width-currLineSpace)) hasSpace := b.GetLineSpacing(b.DisplayPos / b.LineWidth) @@ -357,8 +354,7 @@ func (b *Buffer) Remove() { if b.DisplayPos%b.LineWidth == 0 { // if the user backspaces over the word boundary, do this magic to clear the line // and move to the end of the previous line - fmt.Printf(CursorBOL + ClearToEOL) - fmt.Printf(CursorUp + CursorBOL + cursorRightN(b.Width)) + fmt.Print(CursorBOL + ClearToEOL + CursorUp + CursorBOL + CursorRightN(b.Width)) if b.DisplaySize()%b.LineWidth < (b.DisplaySize()-rLength)%b.LineWidth { b.LineHasSpace.Remove(b.DisplayPos/b.LineWidth - 1) @@ -370,24 +366,23 @@ func (b *Buffer) Remove() { } if rLength == 2 { - fmt.Print(CursorLeft + " " + cursorLeftN(2)) + fmt.Print(CursorLeft + " " + CursorLeftN(2)) } else { fmt.Print(" " + CursorLeft) } } else if (b.DisplayPos-rLength)%b.LineWidth == 0 && hasSpace { - fmt.Printf(CursorBOL + ClearToEOL) - fmt.Printf(CursorUp + CursorBOL + cursorRightN(b.Width)) + fmt.Print(CursorBOL + ClearToEOL + CursorUp + CursorBOL + CursorRightN(b.Width)) if b.Pos == b.Buf.Size() { b.LineHasSpace.Remove(b.DisplayPos/b.LineWidth - 1) } b.DisplayPos -= 1 } else { - fmt.Print(cursorLeftN(rLength)) + fmt.Print(CursorLeftN(rLength)) for range rLength { fmt.Print(" ") } - fmt.Print(cursorLeftN(rLength)) + fmt.Print(CursorLeftN(rLength)) } var eraseExtraLine bool @@ -405,9 +400,9 @@ func (b *Buffer) Remove() { // are trailing characters which go over the line width boundary if eraseExtraLine { remainingLines := (b.DisplaySize() - b.DisplayPos) / b.LineWidth - fmt.Printf(cursorDownN(remainingLines+1) + CursorBOL + ClearToEOL) + fmt.Print(CursorDownN(remainingLines+1) 
+ CursorBOL + ClearToEOL) place := b.DisplayPos % b.LineWidth - fmt.Printf(cursorUpN(remainingLines+1) + cursorRightN(place+len(b.Prompt.prompt()))) + fmt.Print(CursorUpN(remainingLines+1) + CursorRightN(place+len(b.Prompt.prompt()))) } } } @@ -422,9 +417,9 @@ func (b *Buffer) Delete() { if b.DisplaySize()%b.LineWidth == 0 { if b.DisplayPos != b.DisplaySize() { remainingLines := (b.DisplaySize() - b.DisplayPos) / b.LineWidth - fmt.Printf(cursorDownN(remainingLines) + CursorBOL + ClearToEOL) + fmt.Print(CursorDownN(remainingLines) + CursorBOL + ClearToEOL) place := b.DisplayPos % b.LineWidth - fmt.Printf(cursorUpN(remainingLines) + cursorRightN(place+len(b.Prompt.prompt()))) + fmt.Print(CursorUpN(remainingLines) + CursorRightN(place+len(b.Prompt.prompt()))) } } } @@ -471,17 +466,17 @@ func (b *Buffer) DeleteWord() { } func (b *Buffer) ClearScreen() { - fmt.Printf(ClearScreen + CursorReset + b.Prompt.prompt()) + fmt.Print(ClearScreen + CursorReset + b.Prompt.prompt()) if b.IsEmpty() { ph := b.Prompt.placeholder() - fmt.Printf(ColorGrey + ph + cursorLeftN(len(ph)) + ColorDefault) + fmt.Print(ColorGrey + ph + CursorLeftN(len(ph)) + ColorDefault) } else { currPos := b.DisplayPos currIndex := b.Pos b.Pos = 0 b.DisplayPos = 0 b.drawRemaining() - fmt.Printf(CursorReset + cursorRightN(len(b.Prompt.prompt()))) + fmt.Print(CursorReset + CursorRightN(len(b.Prompt.prompt()))) if currPos > 0 { targetLine := currPos / b.LineWidth if targetLine > 0 { @@ -491,10 +486,10 @@ func (b *Buffer) ClearScreen() { } remainder := currPos % b.LineWidth if remainder > 0 { - fmt.Print(cursorRightN(remainder)) + fmt.Print(CursorRightN(remainder)) } if currPos%b.LineWidth == 0 { - fmt.Printf(CursorBOL + b.Prompt.AltPrompt) + fmt.Print(CursorBOL + b.Prompt.AltPrompt) } } b.Pos = currIndex @@ -513,13 +508,13 @@ func (b *Buffer) Replace(r []rune) { b.Buf.Clear() - fmt.Printf(CursorBOL + ClearToEOL) + fmt.Print(CursorBOL + ClearToEOL) for range lineNums { fmt.Print(CursorUp + CursorBOL + ClearToEOL) } - fmt.Printf(CursorBOL + b.Prompt.prompt()) + fmt.Print(CursorBOL + b.Prompt.prompt()) for _, c := range r { b.Add(c) @@ -545,19 +540,3 @@ func (b *Buffer) StringNM(n, m int) string { } return s } - -func cursorLeftN(n int) string { - return fmt.Sprintf(CursorLeftN, n) -} - -func cursorRightN(n int) string { - return fmt.Sprintf(CursorRightN, n) -} - -func cursorUpN(n int) string { - return fmt.Sprintf(CursorUpN, n) -} - -func cursorDownN(n int) string { - return fmt.Sprintf(CursorDownN, n) -} diff --git a/readline/readline.go b/readline/readline.go index e90a5e01..1c14fe10 100644 --- a/readline/readline.go +++ b/readline/readline.go @@ -98,7 +98,7 @@ func (i *Instance) Readline() (string, error) { showPlaceholder := !i.Pasting || i.Prompt.UseAlt if buf.IsEmpty() && showPlaceholder { ph := i.Prompt.placeholder() - fmt.Printf(ColorGrey + ph + fmt.Sprintf(CursorLeftN, len(ph)) + ColorDefault) + fmt.Print(ColorGrey + ph + CursorLeftN(len(ph)) + ColorDefault) } r, err := i.Terminal.Read() diff --git a/readline/types.go b/readline/types.go index 3b88588f..e136d996 100644 --- a/readline/types.go +++ b/readline/types.go @@ -1,5 +1,7 @@ package readline +import "strconv" + const ( CharNull = 0 CharLineStart = 1 @@ -41,34 +43,49 @@ const ( ) const ( - CursorUp = "\033[1A" - CursorDown = "\033[1B" - CursorRight = "\033[1C" - CursorLeft = "\033[1D" + Esc = "\x1b" - CursorSave = "\033[s" - CursorRestore = "\033[u" + CursorSave = Esc + "[s" + CursorRestore = Esc + "[u" - CursorUpN = "\033[%dA" - CursorDownN = "\033[%dB" - CursorRightN = 
"\033[%dC" - CursorLeftN = "\033[%dD" + CursorEOL = Esc + "[E" + CursorBOL = Esc + "[1G" + CursorHide = Esc + "[?25l" + CursorShow = Esc + "[?25h" - CursorEOL = "\033[E" - CursorBOL = "\033[1G" - CursorHide = "\033[?25l" - CursorShow = "\033[?25h" + ClearToEOL = Esc + "[K" + ClearLine = Esc + "[2K" + ClearScreen = Esc + "[2J" + CursorReset = Esc + "[0;0f" - ClearToEOL = "\033[K" - ClearLine = "\033[2K" - ClearScreen = "\033[2J" - CursorReset = "\033[0;0f" + ColorGrey = Esc + "[38;5;245m" + ColorDefault = Esc + "[0m" - ColorGrey = "\033[38;5;245m" - ColorDefault = "\033[0m" + StartBracketedPaste = Esc + "[?2004h" + EndBracketedPaste = Esc + "[?2004l" +) - StartBracketedPaste = "\033[?2004h" - EndBracketedPaste = "\033[?2004l" +func CursorUpN(n int) string { + return Esc + "[" + strconv.Itoa(n) + "A" +} + +func CursorDownN(n int) string { + return Esc + "[" + strconv.Itoa(n) + "B" +} + +func CursorRightN(n int) string { + return Esc + "[" + strconv.Itoa(n) + "C" +} + +func CursorLeftN(n int) string { + return Esc + "[" + strconv.Itoa(n) + "D" +} + +var ( + CursorUp = CursorUpN(1) + CursorDown = CursorDownN(1) + CursorRight = CursorRightN(1) + CursorLeft = CursorLeftN(1) ) const ( diff --git a/server/sched.go b/server/sched.go index c378865b..9947fd32 100644 --- a/server/sched.go +++ b/server/sched.go @@ -418,7 +418,7 @@ func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, // some older models are not compatible with newer versions of llama.cpp // show a generalized compatibility error until there is a better way to // check for model compatibility - if errors.Is(llm.ErrUnsupportedFormat, err) || strings.Contains(err.Error(), "failed to load model") { + if errors.Is(err, llm.ErrUnsupportedFormat) || strings.Contains(err.Error(), "failed to load model") { err = fmt.Errorf("%v: this model may be incompatible with your version of Ollama. If you previously pulled this model, try updating it by running `ollama pull %s`", err, req.model.ShortName) } slog.Info("NewLlamaServer failed", "model", req.model.ModelPath, "error", err) From eda8a32a0936c1aec120b3c544e402cbba7b7eb7 Mon Sep 17 00:00:00 2001 From: Bruce MacDonald Date: Tue, 13 Aug 2024 23:39:18 +0000 Subject: [PATCH 40/71] update chatml template format to latest in docs (#6344) --- docs/template.md | 6 ------ 1 file changed, 6 deletions(-) diff --git a/docs/template.md b/docs/template.md index f6ce06ba..1d7104de 100644 --- a/docs/template.md +++ b/docs/template.md @@ -112,15 +112,9 @@ Keep the following tips and best practices in mind when working with Go template ChatML is a popular template format. It can be used for models such as Databrick's DBRX, Intel's Neural Chat, and Microsoft's Orca 2. ```gotmpl -{{- if .System }}<|im_start|>system -{{ .System }}<|im_end|> -{{ end }} {{- range .Messages }}<|im_start|>{{ .Role }} {{ .Content }}<|im_end|> {{ end }}<|im_start|>assistant -{{ else }} -{{ if .System }}<|im_start|>system -{{ .System }}<|im_end|> ``` ### Example Tools From 8e1050f366e5451651f8385fa570b78b9c7d21cc Mon Sep 17 00:00:00 2001 From: Blake Mizerany Date: Tue, 13 Aug 2024 16:47:35 -0700 Subject: [PATCH 41/71] server: reduce max connections used in download (#6347) The previous value of 64 was WAY too high and unnecessary. It reached diminishing returns and blew past it. This is a more reasonable number for _most_ normal cases. For users on cloud servers with excellent network quality, this will keep screaming for them, without hitting our CDN limits. 
For users with relatively poor network quality, this will keep them from saturating their network and causing other issues. --- server/download.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/download.go b/server/download.go index 1bca86bf..02f7ae88 100644 --- a/server/download.go +++ b/server/download.go @@ -94,7 +94,7 @@ func (p *blobDownloadPart) UnmarshalJSON(b []byte) error { } const ( - numDownloadParts = 64 + numDownloadParts = 16 minDownloadPartSize int64 = 100 * format.MegaByte maxDownloadPartSize int64 = 1000 * format.MegaByte ) From 0a8d6ea86d54bbda9d701c38e4279a9c5c204cd9 Mon Sep 17 00:00:00 2001 From: longtao <39115651+eust-w@users.noreply.github.com> Date: Wed, 14 Aug 2024 08:54:19 +0800 Subject: [PATCH 42/71] Fix typo and improve readability (#5964) * Fix typo and improve readability Summary: * Rename updatAvailableMenuID to updateAvailableMenuID * Replace unused cmd parameter with _ in RunServer function * Fix typos in comments (cherry picked from commit 5b8715f0b04773369e8eb1f9e6737995a0ab3ba7) * Update api/client.go Co-authored-by: Jeffrey Morgan --------- Co-authored-by: Jeffrey Morgan --- api/client.go | 4 ++-- app/tray/wintray/menus.go | 14 +++++++------- cmd/cmd.go | 2 +- types/model/name.go | 2 +- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/api/client.go b/api/client.go index bbdf8202..2528fb21 100644 --- a/api/client.go +++ b/api/client.go @@ -298,7 +298,7 @@ func (c *Client) List(ctx context.Context) (*ListResponse, error) { return &lr, nil } -// List running models. +// ListRunning lists running models. func (c *Client) ListRunning(ctx context.Context) (*ProcessResponse, error) { var lr ProcessResponse if err := c.do(ctx, http.MethodGet, "/api/ps", nil, &lr); err != nil { @@ -333,7 +333,7 @@ func (c *Client) Show(ctx context.Context, req *ShowRequest) (*ShowResponse, err return &resp, nil } -// Hearbeat checks if the server has started and is responsive; if yes, it +// Heartbeat checks if the server has started and is responsive; if yes, it // returns nil, otherwise an error. 
func (c *Client) Heartbeat(ctx context.Context) error { if err := c.do(ctx, http.MethodHead, "/", nil, nil); err != nil { diff --git a/app/tray/wintray/menus.go b/app/tray/wintray/menus.go index 9cb3b893..59624444 100644 --- a/app/tray/wintray/menus.go +++ b/app/tray/wintray/menus.go @@ -11,12 +11,12 @@ import ( ) const ( - updatAvailableMenuID = 1 - updateMenuID = updatAvailableMenuID + 1 - separatorMenuID = updateMenuID + 1 - diagLogsMenuID = separatorMenuID + 1 - diagSeparatorMenuID = diagLogsMenuID + 1 - quitMenuID = diagSeparatorMenuID + 1 + updateAvailableMenuID = 1 + updateMenuID = updateAvailableMenuID + 1 + separatorMenuID = updateMenuID + 1 + diagLogsMenuID = separatorMenuID + 1 + diagSeparatorMenuID = diagLogsMenuID + 1 + quitMenuID = diagSeparatorMenuID + 1 ) func (t *winTray) initMenus() error { @@ -35,7 +35,7 @@ func (t *winTray) initMenus() error { func (t *winTray) UpdateAvailable(ver string) error { if !t.updateNotified { slog.Debug("updating menu and sending notification for new update") - if err := t.addOrUpdateMenuItem(updatAvailableMenuID, 0, updateAvailableMenuTitle, true); err != nil { + if err := t.addOrUpdateMenuItem(updateAvailableMenuID, 0, updateAvailableMenuTitle, true); err != nil { return fmt.Errorf("unable to create menu entries %w", err) } if err := t.addOrUpdateMenuItem(updateMenuID, 0, updateMenutTitle, false); err != nil { diff --git a/cmd/cmd.go b/cmd/cmd.go index 2356110e..fd7246c8 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -1125,7 +1125,7 @@ func generate(cmd *cobra.Command, opts runOptions) error { return nil } -func RunServer(cmd *cobra.Command, _ []string) error { +func RunServer(_ *cobra.Command, _ []string) error { if err := initializeKeypair(); err != nil { return err } diff --git a/types/model/name.go b/types/model/name.go index 018cb2f5..75b35ef7 100644 --- a/types/model/name.go +++ b/types/model/name.go @@ -219,7 +219,7 @@ func (n Name) String() string { return b.String() } -// DisplayShort returns a short string version of the name. +// DisplayShortest returns a short string version of the name. func (n Name) DisplayShortest() string { var sb strings.Builder From 8200c371aed68dec5c74e869491ee8e5749ba1eb Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Wed, 14 Aug 2024 15:19:50 -0700 Subject: [PATCH 43/71] add `CONTRIBUTING.md` (#6349) --- CONTRIBUTING.md | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 CONTRIBUTING.md diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 00000000..7f12a0fc --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,37 @@ +# Contributing to Ollama + +Thank you for your interest in contributing to Ollama! Here are a few guidelines to help get you started. + +## Set up + +See the [development documentation](./docs/development.md) for instructions on how to build and run Ollama locally. + +## Pull requests + +### Ideal issues + +* [Bugs](https://github.com/ollama/ollama/issues?q=is%3Aissue+is%3Aopen+label%3Abug): issues where Ollama stops working or where it results in an unexpected error. +* [Performance](https://github.com/ollama/ollama/issues?q=is%3Aissue+is%3Aopen+label%3Aperformance): issues to make Ollama faster at model inference, downloading or uploading. +* [Security](https://github.com/ollama/ollama/blob/main/SECURITY.md): issues that could lead to a security vulnerability. As mentioned in [SECURITY.md](https://github.com/ollama/ollama/blob/main/SECURITY.md), please do not disclose security vulnerabilities publicly. 
+ +### Issues that are harder to review + +* New features: new features (e.g. API fields, environment variables) add surface area to Ollama and make it harder to maintain in the long run as they cannot be removed without potentially breaking users in the future. +* Refactoring: large code improvements are important, but can be harder or take longer to review and merge. +* Documentation: small updates to fill in or correct missing documentation are helpful; however, large documentation additions can be hard to maintain over time. + +### Issues that may not be accepted + +* Changes that break backwards compatibility in Ollama's API (including the OpenAI-compatible API) +* Changes that add significant friction to the user experience +* Changes that create a large future maintenance burden for maintainers and contributors + +### Best practices + +* Commit messages: please leave both a title and a description in your commit messages. The title should be a short summary of the changes, with a leading word that explains the section of the code being changed (e.g. `api: fix parsing of prompt field`). In the description, leave 2-3 short sentences that explain more about the change and its impact. +* Tests: please add test coverage to changes where possible. +* Minimize dependencies: avoid adding new dependencies unless absolutely necessary. + +## Need help? + +If you need help with anything, feel free to reach out to us on our [Discord server](https://discord.gg/ollama). From b3f75fc812fc1559090a7fd9739bd203817a5979 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Wed, 14 Aug 2024 14:37:51 -0700 Subject: [PATCH 44/71] fix noprune --- server/images.go | 63 ++++++++++++++++++------------------------------ 1 file changed, 24 insertions(+), 39 deletions(-) diff --git a/server/images.go b/server/images.go index 0e753f56..798ed818 100644 --- a/server/images.go +++ b/server/images.go @@ -215,25 +215,20 @@ func GetManifest(mp ModelPath) (*Manifest, string, error) { return nil, "", err } - if _, err = os.Stat(fp); err != nil { - return nil, "", err - } - - var manifest *Manifest - - bts, err := os.ReadFile(fp) + f, err := os.Open(fp) if err != nil { - return nil, "", fmt.Errorf("couldn't open file '%s'", fp) + return nil, "", err } + defer f.Close() - shaSum := sha256.Sum256(bts) - shaStr := hex.EncodeToString(shaSum[:]) + sha256sum := sha256.New() - if err := json.Unmarshal(bts, &manifest); err != nil { + var manifest Manifest + if err := json.NewDecoder(io.TeeReader(f, sha256sum)).Decode(&manifest); err != nil { return nil, "", err } - return manifest, shaStr, nil + return &manifest, hex.EncodeToString(sha256sum.Sum(nil)), nil } func GetModel(name string) (*Model, error) { @@ -716,7 +711,7 @@ func deleteUnusedLayers(skipModelPath *ModelPath, deleteMap map[string]struct{}) // save (i.e.
delete from the deleteMap) any files used in other manifests manifest, _, err := GetManifest(fmp) if err != nil { - return err + return fmt.Errorf("error reading manifest %s: %w", path, err) } for _, layer := range manifest.Layers { @@ -781,8 +776,7 @@ func PruneLayers() error { slog.Info(fmt.Sprintf("total blobs: %d", len(deleteMap))) - err = deleteUnusedLayers(nil, deleteMap) - if err != nil { + if err := deleteUnusedLayers(nil, deleteMap); err != nil { slog.Error(fmt.Sprintf("couldn't remove unused layers: %v", err)) return nil } @@ -877,26 +871,19 @@ func PushModel(ctx context.Context, name string, regOpts *registryOptions, fn fu func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn func(api.ProgressResponse)) error { mp := ParseModelPath(name) - var manifest *Manifest - var err error - var noprune string - // build deleteMap to prune unused layers deleteMap := make(map[string]struct{}) - - if !envconfig.NoPrune() { - manifest, _, err = GetManifest(mp) - if err != nil && !errors.Is(err, os.ErrNotExist) { - return err + manifest, _, err := GetManifest(mp) + if errors.Is(err, os.ErrNotExist) { + // noop + } else if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } else { + for _, l := range manifest.Layers { + deleteMap[l.Digest] = struct{}{} } - - if manifest != nil { - for _, l := range manifest.Layers { - deleteMap[l.Digest] = struct{}{} - } - if manifest.Config.Digest != "" { - deleteMap[manifest.Config.Digest] = struct{}{} - } + if manifest.Config.Digest != "" { + deleteMap[manifest.Config.Digest] = struct{}{} } } @@ -975,11 +962,9 @@ func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn fu return err } - if noprune == "" { - fn(api.ProgressResponse{Status: "removing any unused layers"}) - err = deleteUnusedLayers(nil, deleteMap) - if err != nil { - slog.Error(fmt.Sprintf("couldn't remove unused layers: %v", err)) + if !envconfig.NoPrune() && len(deleteMap) > 0 { + fn(api.ProgressResponse{Status: "removing unused layers"}) + if err := deleteUnusedLayers(nil, deleteMap); err != nil { fn(api.ProgressResponse{Status: fmt.Sprintf("couldn't remove unused layers: %v", err)}) } } @@ -1000,12 +985,12 @@ func pullModelManifest(ctx context.Context, mp ModelPath, regOpts *registryOptio } defer resp.Body.Close() - var m *Manifest + var m Manifest if err := json.NewDecoder(resp.Body).Decode(&m); err != nil { return nil, err } - return m, err + return &m, err } // GetSHA256Digest returns the SHA256 hash of a given buffer and returns it, and the size of buffer From 237dccba1edb41bb65ed1ffc6eafdd40dd6085e4 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Wed, 14 Aug 2024 16:36:07 -0700 Subject: [PATCH 45/71] skip invalid manifest files --- server/images.go | 35 +++++------------------------------ server/manifest.go | 2 +- 2 files changed, 6 insertions(+), 31 deletions(-) diff --git a/server/images.go b/server/images.go index 798ed818..8b3a67cf 100644 --- a/server/images.go +++ b/server/images.go @@ -687,43 +687,18 @@ func CopyModel(src, dst model.Name) error { return err } -func deleteUnusedLayers(skipModelPath *ModelPath, deleteMap map[string]struct{}) error { - fp, err := GetManifestPath() +func deleteUnusedLayers(deleteMap map[string]struct{}) error { + manifests, err := Manifests() if err != nil { return err } - walkFunc := func(path string, info os.FileInfo, _ error) error { - if info.IsDir() { - return nil - } - - dir, file := filepath.Split(path) - dir = strings.Trim(strings.TrimPrefix(dir, fp), string(os.PathSeparator)) - tag 
:= strings.Join([]string{dir, file}, ":") - fmp := ParseModelPath(tag) - - // skip the manifest we're trying to delete - if skipModelPath != nil && skipModelPath.GetFullTagname() == fmp.GetFullTagname() { - return nil - } - - // save (i.e. delete from the deleteMap) any files used in other manifests - manifest, _, err := GetManifest(fmp) - if err != nil { - return fmt.Errorf("error reading manifest %s: %w", path, err) - } - + for _, manifest := range manifests { for _, layer := range manifest.Layers { delete(deleteMap, layer.Digest) } delete(deleteMap, manifest.Config.Digest) - return nil - } - - if err := filepath.Walk(fp, walkFunc); err != nil { - return err } // only delete the files which are still in the deleteMap @@ -776,7 +751,7 @@ func PruneLayers() error { slog.Info(fmt.Sprintf("total blobs: %d", len(deleteMap))) - if err := deleteUnusedLayers(nil, deleteMap); err != nil { + if err := deleteUnusedLayers(deleteMap); err != nil { slog.Error(fmt.Sprintf("couldn't remove unused layers: %v", err)) return nil } @@ -964,7 +939,7 @@ func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn fu if !envconfig.NoPrune() && len(deleteMap) > 0 { fn(api.ProgressResponse{Status: "removing unused layers"}) - if err := deleteUnusedLayers(nil, deleteMap); err != nil { + if err := deleteUnusedLayers(deleteMap); err != nil { fn(api.ProgressResponse{Status: fmt.Sprintf("couldn't remove unused layers: %v", err)}) } } diff --git a/server/manifest.go b/server/manifest.go index 6a5d7b88..0f19641d 100644 --- a/server/manifest.go +++ b/server/manifest.go @@ -150,7 +150,7 @@ func Manifests() (map[model.Name]*Manifest, error) { n := model.ParseNameFromFilepath(rel) if !n.IsValid() { - slog.Warn("bad manifest name", "path", rel, "error", err) + slog.Warn("bad manifest name", "path", rel) continue } From 3a75e74e34c976d596437c8aa14587ada562301e Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Thu, 15 Aug 2024 10:29:14 -0700 Subject: [PATCH 46/71] only skip invalid json manifests --- server/manifest.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/server/manifest.go b/server/manifest.go index 0f19641d..6b04753f 100644 --- a/server/manifest.go +++ b/server/manifest.go @@ -5,6 +5,7 @@ import ( "encoding/hex" "encoding/json" "errors" + "fmt" "io" "log/slog" "os" @@ -155,9 +156,11 @@ func Manifests() (map[model.Name]*Manifest, error) { } m, err := ParseNamedManifest(n) - if err != nil { + if syntax := &(json.SyntaxError{}); errors.As(err, &syntax) { slog.Warn("bad manifest", "name", n, "error", err) continue + } else if err != nil { + return nil, fmt.Errorf("%s: %w", n, err) } ms[n] = m From a84c05cf9140c2eb288a6c7b56bb1c592bbaacc7 Mon Sep 17 00:00:00 2001 From: eust-w Date: Fri, 16 Aug 2024 06:00:12 +0800 Subject: [PATCH 47/71] fix: Add tooltip to system tray icon - Updated setIcon method to include tooltip text for the system tray icon. - Added NIF_TIP flag and set the tooltip text using UTF16 encoding. 
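For context, the conversion this patch relies on can be sketched in isolation. Below is a minimal, self-contained example of copying a Go string into a fixed-size UTF-16 tooltip buffer; the 128-element size and the variable names are illustrative assumptions rather than the actual NOTIFYICONDATA definition, and the portable unicode/utf16 package stands in here for the Windows-only syscall.UTF16FromString used in the diff:

```go
package main

import (
	"fmt"
	"unicode/utf16"
)

func main() {
	// Assumed fixed-size tooltip buffer, mirroring NOTIFYICONDATA.szTip.
	var tip [128]uint16

	// Encode the tooltip text as UTF-16 code units.
	s := utf16.Encode([]rune("Ollama"))

	// copy stops at the shorter of src and dst, so an overlong tooltip is
	// silently truncated rather than overflowing the buffer.
	n := copy(tip[:], s)
	fmt.Println("copied", n, "UTF-16 code units")
}
```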
Resolves: #6372 --- app/tray/wintray/tray.go | 8 +++++++- app/tray/wintray/w32api.go | 1 + 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/app/tray/wintray/tray.go b/app/tray/wintray/tray.go index ccd087a1..6f827893 100644 --- a/app/tray/wintray/tray.go +++ b/app/tray/wintray/tray.go @@ -11,6 +11,7 @@ import ( "path/filepath" "sort" "sync" + "syscall" "unsafe" "golang.org/x/sys/windows" @@ -433,7 +434,12 @@ func (t *winTray) setIcon(src string) error { t.muNID.Lock() defer t.muNID.Unlock() t.nid.Icon = h - t.nid.Flags |= NIF_ICON + t.nid.Flags |= NIF_ICON | NIF_TIP + if toolTipUTF16, err := syscall.UTF16FromString(commontray.ToolTip); err == nil { + copy(t.nid.Tip[:], toolTipUTF16) + } else { + return err + } t.nid.Size = uint32(unsafe.Sizeof(*t.nid)) return t.nid.modify() diff --git a/app/tray/wintray/w32api.go b/app/tray/wintray/w32api.go index a1e0381d..7c7c0ac8 100644 --- a/app/tray/wintray/w32api.go +++ b/app/tray/wintray/w32api.go @@ -61,6 +61,7 @@ const ( MIIM_SUBMENU = 0x00000004 MIM_APPLYTOSUBMENUS = 0x80000000 NIF_ICON = 0x00000002 + NIF_TIP = 0x00000004 NIF_INFO = 0x00000010 NIF_MESSAGE = 0x00000001 SW_HIDE = 0 From bdc4308afb72d47ce63583427f810b02d569d58a Mon Sep 17 00:00:00 2001 From: zwwhdls Date: Fri, 16 Aug 2024 11:43:19 +0800 Subject: [PATCH 48/71] fix: chmod new layer to 0o644 when creating it Signed-off-by: zwwhdls --- server/layer.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/server/layer.go b/server/layer.go index c666bd10..0bdee72b 100644 --- a/server/layer.go +++ b/server/layer.go @@ -51,6 +51,9 @@ func NewLayer(r io.Reader, mediatype string) (Layer, error) { if err := os.Rename(temp.Name(), blob); err != nil { return Layer{}, err } + if err := os.Chmod(blob, 0o644); err != nil { + return Layer{}, err + } } return Layer{ From 0ad0e738cd7ed1266b3c210ad54dcd2b70142563 Mon Sep 17 00:00:00 2001 From: Richard Lyons Date: Sun, 18 Aug 2024 01:43:26 +0200 Subject: [PATCH 49/71] Override numParallel only if unset. --- server/sched.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/server/sched.go b/server/sched.go index 9947fd32..4d9c0296 100644 --- a/server/sched.go +++ b/server/sched.go @@ -734,7 +734,9 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoL // If multiple Libraries are detected, pick the Library which loads the most layers for the model func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList { - *numParallel = 1 + if *numParallel <= 0 { + *numParallel = 1 + } byLibrary := gpus.ByLibrary() if len(byLibrary) <= 1 { return gpus From 9352eeb752531decccc7c6b91a07bc3dd5efa67e Mon Sep 17 00:00:00 2001 From: Richard Lyons Date: Sun, 18 Aug 2024 02:55:01 +0200 Subject: [PATCH 50/71] Reset NumCtx. --- server/sched.go | 1 + 1 file changed, 1 insertion(+) diff --git a/server/sched.go b/server/sched.go index 4d9c0296..3fe6d7fc 100644 --- a/server/sched.go +++ b/server/sched.go @@ -736,6 +736,7 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoL func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList { if *numParallel <= 0 { *numParallel = 1 + req.opts.NumCtx = req.origNumCtx } byLibrary := gpus.ByLibrary() if len(byLibrary) <= 1 { From 885cf45087863aa2e064a05da99e8bd07d69970a Mon Sep 17 00:00:00 2001 From: Richard Lyons Date: Sun, 18 Aug 2024 03:07:16 +0200 Subject: [PATCH 51/71] Fix white space. 
--- server/sched.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/server/sched.go b/server/sched.go index 3fe6d7fc..9d8c4144 100644 --- a/server/sched.go +++ b/server/sched.go @@ -736,8 +736,8 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoL func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList { if *numParallel <= 0 { *numParallel = 1 - req.opts.NumCtx = req.origNumCtx - } + req.opts.NumCtx = req.origNumCtx + } byLibrary := gpus.ByLibrary() if len(byLibrary) <= 1 { return gpus From 9fddef3731842bd8f40d217da6b84ab7ef5dfe97 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Mon, 19 Aug 2024 09:20:52 -0700 Subject: [PATCH 52/71] server: limit upload parts to 16 (#6411) --- server/upload.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/upload.go b/server/upload.go index 2f115436..020e8955 100644 --- a/server/upload.go +++ b/server/upload.go @@ -45,7 +45,7 @@ type blobUpload struct { } const ( - numUploadParts = 64 + numUploadParts = 16 minUploadPartSize int64 = 100 * format.MegaByte maxUploadPartSize int64 = 1000 * format.MegaByte ) From 74d45f010276c2f2653f3ca8c4f76cb0552fb46e Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Mon, 8 Jul 2024 12:50:11 -0700 Subject: [PATCH 53/71] Refactor linux packaging This adjusts linux to follow a similar model to windows with a discrete archive (zip/tgz) to carry the primary executable, and dependent libraries. Runners are still carried as payloads inside the main binary. Darwin retains the payload model where the Go binary is fully self-contained. --- .github/workflows/release.yaml | 1 - Dockerfile | 29 ++++++------ app/ollama.iss | 11 +---- envconfig/config.go | 4 +- gpu/amd_common.go | 2 +- gpu/amd_windows.go | 2 +- gpu/gpu.go | 50 ++++++++++++++------- gpu/gpu_linux.go | 2 +- llm/ext_server/CMakeLists.txt | 3 +- llm/generate/gen_common.sh | 17 ++++++- llm/generate/gen_linux.sh | 81 ++++++++++++++++------------------ llm/generate/gen_windows.ps1 | 43 +++++++++--------- llm/server.go | 12 +++-- scripts/build_linux.sh | 10 ++--- scripts/build_windows.ps1 | 12 ++--- scripts/install.sh | 31 ++++++++++--- 16 files changed, 171 insertions(+), 139 deletions(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 5ae630c3..9287f6f7 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -363,7 +363,6 @@ jobs: - run: | ./scripts/build_linux.sh ./scripts/build_docker.sh - mv dist/deps/* dist/ - uses: actions/upload-artifact@v4 with: name: dist-linux-amd64 diff --git a/Dockerfile b/Dockerfile index c8efdd8a..120ddc21 100644 --- a/Dockerfile +++ b/Dockerfile @@ -18,6 +18,7 @@ ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH COPY --from=llm-code / /go/src/github.com/ollama/ollama/ WORKDIR /go/src/github.com/ollama/ollama/llm/generate ARG CGO_CFLAGS +ENV GOARCH amd64 RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION-devel-rockylinux8 AS cuda-build-arm64 @@ -28,6 +29,7 @@ ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH COPY --from=llm-code / /go/src/github.com/ollama/ollama/ WORKDIR /go/src/github.com/ollama/ollama/llm/generate ARG CGO_CFLAGS +ENV GOARCH arm64 RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS rocm-build-amd64 @@ -40,15 +42,10 @@ COPY --from=llm-code /
/go/src/github.com/ollama/ollama/ WORKDIR /go/src/github.com/ollama/ollama/llm/generate ARG CGO_CFLAGS ARG AMDGPU_TARGETS +ENV GOARCH amd64 RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh -RUN mkdir /tmp/scratch && \ - for dep in $(zcat /go/src/github.com/ollama/ollama/llm/build/linux/x86_64/rocm*/bin/deps.txt.gz) ; do \ - cp ${dep} /tmp/scratch/ || exit 1 ; \ - done && \ - (cd /opt/rocm/lib && tar cf - rocblas/library) | (cd /tmp/scratch/ && tar xf - ) && \ - mkdir -p /go/src/github.com/ollama/ollama/dist/deps/ && \ - (cd /tmp/scratch/ && tar czvf /go/src/github.com/ollama/ollama/dist/deps/ollama-linux-amd64-rocm.tgz . ) - +RUN mkdir -p ../../dist/linux-amd64/ollama_libs && \ + (cd /opt/rocm/lib && tar cf - rocblas/library) | (cd ../../dist/linux-amd64/ollama_libs && tar xf - ) FROM --platform=linux/amd64 centos:7 AS cpu-builder-amd64 ARG CMAKE_VERSION @@ -59,6 +56,7 @@ ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH COPY --from=llm-code / /go/src/github.com/ollama/ollama/ ARG OLLAMA_CUSTOM_CPU_DEFS ARG CGO_CFLAGS +ENV GOARCH amd64 WORKDIR /go/src/github.com/ollama/ollama/llm/generate FROM --platform=linux/amd64 cpu-builder-amd64 AS static-build-amd64 @@ -79,6 +77,7 @@ ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH COPY --from=llm-code / /go/src/github.com/ollama/ollama/ ARG OLLAMA_CUSTOM_CPU_DEFS ARG CGO_CFLAGS +ENV GOARCH arm64 WORKDIR /go/src/github.com/ollama/ollama/llm/generate FROM --platform=linux/arm64 cpu-builder-arm64 AS static-build-arm64 @@ -95,12 +94,13 @@ COPY . . COPY --from=static-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ +COPY --from=cuda-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/ COPY --from=cuda-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ +COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/ COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ -COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/deps/ ./dist/deps/ ARG GOFLAGS ARG CGO_CFLAGS -RUN go build -trimpath . +RUN go build -trimpath -o dist/linux-amd64/ollama . # Intermediate stage used for ./scripts/build_linux.sh FROM --platform=linux/arm64 cpu-build-arm64 AS build-arm64 @@ -109,23 +109,24 @@ ARG GOLANG_VERSION WORKDIR /go/src/github.com/ollama/ollama COPY . . COPY --from=static-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ +COPY --from=cuda-build-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ COPY --from=cuda-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ ARG GOFLAGS ARG CGO_CFLAGS -RUN go build -trimpath . +RUN go build -trimpath -o dist/linux-arm64/ollama . 
# Runtime stages FROM --platform=linux/amd64 ubuntu:22.04 as runtime-amd64 RUN apt-get update && apt-get install -y ca-certificates -COPY --from=build-amd64 /go/src/github.com/ollama/ollama/ollama /bin/ollama +COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/ollama /bin/ollama FROM --platform=linux/arm64 ubuntu:22.04 as runtime-arm64 RUN apt-get update && apt-get install -y ca-certificates -COPY --from=build-arm64 /go/src/github.com/ollama/ollama/ollama /bin/ollama +COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/ollama /bin/ollama # Radeon images are much larger so we keep it distinct from the CPU/CUDA image FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete as runtime-rocm RUN update-pciids -COPY --from=build-amd64 /go/src/github.com/ollama/ollama/ollama /bin/ollama +COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/ollama /bin/ollama EXPOSE 11434 ENV OLLAMA_HOST 0.0.0.0 diff --git a/app/ollama.iss b/app/ollama.iss index dc6178f7..e9cf48ec 100644 --- a/app/ollama.iss +++ b/app/ollama.iss @@ -91,16 +91,7 @@ Source: "..\ollama.exe"; DestDir: "{app}"; Flags: ignoreversion 64bit Source: "..\dist\windows-{#ARCH}\ollama_runners\*"; DestDir: "{app}\ollama_runners"; Flags: ignoreversion 64bit recursesubdirs Source: "..\dist\ollama_welcome.ps1"; DestDir: "{app}"; Flags: ignoreversion Source: ".\assets\app.ico"; DestDir: "{app}"; Flags: ignoreversion -#if DirExists("..\dist\windows-amd64\cuda") - Source: "..\dist\windows-amd64\cuda\*"; DestDir: "{app}\cuda\"; Flags: ignoreversion recursesubdirs -#endif -#if DirExists("..\dist\windows-amd64\oneapi") - Source: "..\dist\windows-amd64\oneapi\*"; DestDir: "{app}\oneapi\"; Flags: ignoreversion recursesubdirs -#endif -#if DirExists("..\dist\windows-amd64\rocm") - Source: "..\dist\windows-amd64\rocm\*"; DestDir: "{app}\rocm\"; Flags: ignoreversion recursesubdirs -#endif - +Source: "..\dist\windows-amd64\ollama_libs\*"; DestDir: "{app}\ollama_libs\"; Flags: ignoreversion recursesubdirs [Icons] Name: "{group}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilename: "{app}\app.ico" diff --git a/envconfig/config.go b/envconfig/config.go index b82b773d..7f0976c0 100644 --- a/envconfig/config.go +++ b/envconfig/config.go @@ -193,8 +193,8 @@ func RunnersDir() (p string) { for _, root := range []string{filepath.Dir(exe), cwd} { paths = append(paths, root, - filepath.Join(root, "windows-"+runtime.GOARCH), - filepath.Join(root, "dist", "windows-"+runtime.GOARCH), + filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH), + filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH), ) } diff --git a/gpu/amd_common.go b/gpu/amd_common.go index 2839cb7c..05747208 100644 --- a/gpu/amd_common.go +++ b/gpu/amd_common.go @@ -54,7 +54,7 @@ func commonAMDValidateLibDir() (string, error) { // Installer payload location if we're running the installed binary exe, err := os.Executable() if err == nil { - rocmTargetDir := filepath.Join(filepath.Dir(exe), "rocm") + rocmTargetDir := filepath.Join(filepath.Dir(exe), "ollama_libs") if rocmLibUsable(rocmTargetDir) { slog.Debug("detected ROCM next to ollama executable " + rocmTargetDir) return rocmTargetDir, nil diff --git a/gpu/amd_windows.go b/gpu/amd_windows.go index edabeb43..5d25a966 100644 --- a/gpu/amd_windows.go +++ b/gpu/amd_windows.go @@ -153,7 +153,7 @@ func AMDValidateLibDir() (string, error) { // Installer payload (if we're running from some other location) localAppData := os.Getenv("LOCALAPPDATA") appDir := 
filepath.Join(localAppData, "Programs", "Ollama") - rocmTargetDir := filepath.Join(appDir, "rocm") + rocmTargetDir := filepath.Join(appDir, "ollama_libs") if rocmLibUsable(rocmTargetDir) { slog.Debug("detected ollama installed ROCm at " + rocmTargetDir) return rocmTargetDir, nil diff --git a/gpu/gpu.go b/gpu/gpu.go index dc124a3e..d0ae0f34 100644 --- a/gpu/gpu.go +++ b/gpu/gpu.go @@ -229,11 +229,7 @@ func GetGPUInfo() GpuInfoList { return GpuInfoList{cpus[0].GpuInfo} } - // On windows we bundle the nvidia library one level above the runner dir - depPath := "" - if runtime.GOOS == "windows" && envconfig.RunnersDir() != "" { - depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir()), "cuda") - } + depPath := GetDepDir() // Load ALL libraries cHandles = initCudaHandles() @@ -306,13 +302,6 @@ func GetGPUInfo() GpuInfoList { if envconfig.IntelGPU() { oHandles = initOneAPIHandles() if oHandles != nil && oHandles.oneapi != nil { - - // On windows we bundle the oneapi library one level above the runner dir - depPath = "" - if runtime.GOOS == "windows" && envconfig.RunnersDir() != "" { - depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir()), "oneapi") - } - for d := range oHandles.oneapi.num_drivers { if oHandles.oneapi == nil { // shouldn't happen @@ -467,10 +456,12 @@ func GetGPUInfo() GpuInfoList { func FindGPULibs(baseLibName string, defaultPatterns []string) []string { // Multiple GPU libraries may exist, and some may not work, so keep trying until we exhaust them var ldPaths []string - var patterns []string gpuLibPaths := []string{} slog.Debug("Searching for GPU library", "name", baseLibName) + // Start with our bundled libraries + patterns := []string{filepath.Join(GetDepDir(), baseLibName)} + switch runtime.GOOS { case "windows": ldPaths = strings.Split(os.Getenv("PATH"), ";") @@ -479,13 +470,14 @@ func FindGPULibs(baseLibName string, defaultPatterns []string) []string { default: return gpuLibPaths } - // Start with whatever we find in the PATH/LD_LIBRARY_PATH + + // Then with whatever we find in the PATH/LD_LIBRARY_PATH for _, ldPath := range ldPaths { d, err := filepath.Abs(ldPath) if err != nil { continue } - patterns = append(patterns, filepath.Join(d, baseLibName+"*")) + patterns = append(patterns, filepath.Join(d, baseLibName)) } patterns = append(patterns, defaultPatterns...) 
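	// Net search order: the bundled ollama_libs directory first, then each
	// PATH/LD_LIBRARY_PATH entry, then the platform default patterns.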
slog.Debug("gpu library search", "globs", patterns) @@ -641,3 +633,31 @@ func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) { return "", "" } } + +func GetDepDir() string { + // On Windows/linux we bundle the dependencies at the same level as the executable + appExe, err := os.Executable() + if err != nil { + slog.Warn("failed to lookup executable path", "error", err) + } + cwd, err := os.Getwd() + if err != nil { + slog.Warn("failed to lookup working directory", "error", err) + } + // Scan for any of our dependeices, and pick first match + for _, root := range []string{filepath.Dir(appExe), cwd} { + libDep := "ollama_libs" + if _, err := os.Stat(filepath.Join(root, libDep)); err == nil { + return filepath.Join(root, libDep) + } + // Developer mode, local build + if _, err := os.Stat(filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH, libDep)); err == nil { + return filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH, libDep) + } + if _, err := os.Stat(filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH, libDep)); err == nil { + return filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH, libDep) + } + } + slog.Warn("unable to locate gpu dependency libraries") + return "" +} diff --git a/gpu/gpu_linux.go b/gpu/gpu_linux.go index d6d2675c..d4d20bc4 100644 --- a/gpu/gpu_linux.go +++ b/gpu/gpu_linux.go @@ -47,7 +47,7 @@ var ( CudartMgmtName = "libcudart.so*" NvcudaMgmtName = "libcuda.so*" NvmlMgmtName = "" // not currently wired on linux - OneapiMgmtName = "libze_intel_gpu.so" + OneapiMgmtName = "libze_intel_gpu.so*" ) func GetCPUMem() (memInfo, error) { diff --git a/llm/ext_server/CMakeLists.txt b/llm/ext_server/CMakeLists.txt index bfc97c63..90fd0ef2 100644 --- a/llm/ext_server/CMakeLists.txt +++ b/llm/ext_server/CMakeLists.txt @@ -1,12 +1,13 @@ set(TARGET ollama_llama_server) option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON) +set(LLAMA_SERVER_LDFLAGS $ENV{LLAMA_SERVER_LDFLAGS}) include_directories(${CMAKE_CURRENT_SOURCE_DIR}) add_executable(${TARGET} server.cpp utils.hpp json.hpp httplib.h) install(TARGETS ${TARGET} RUNTIME) target_compile_definitions(${TARGET} PRIVATE SERVER_VERBOSE=$ ) -target_link_libraries(${TARGET} PRIVATE ggml llama common llava ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE ggml llama common llava ${CMAKE_THREAD_LIBS_INIT} ${LLAMA_SERVER_LDFLAGS}) if (WIN32) TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32) endif() diff --git a/llm/generate/gen_common.sh b/llm/generate/gen_common.sh index da1b0688..f1541f2a 100644 --- a/llm/generate/gen_common.sh +++ b/llm/generate/gen_common.sh @@ -9,11 +9,14 @@ init_vars() { ARCH="arm64" ;; *) - ARCH=$(uname -m | sed -e "s/aarch64/arm64/g") + echo "GOARCH must be set" + echo "this script is meant to be run from within go generate" + exit 1 + ;; esac LLAMACPP_DIR=../llama.cpp - CMAKE_DEFS="" + CMAKE_DEFS="-DCMAKE_SKIP_RPATH=on" CMAKE_TARGETS="--target ollama_llama_server" if echo "${CGO_CFLAGS}" | grep -- '-g' >/dev/null; then CMAKE_DEFS="-DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_GPROF=on -DLLAMA_SERVER_VERBOSE=on ${CMAKE_DEFS}" @@ -27,6 +30,7 @@ init_vars() { WHOLE_ARCHIVE="-Wl,-force_load" NO_WHOLE_ARCHIVE="" GCC_ARCH="-arch ${ARCH}" + DIST_BASE=../../dist/darwin-${GOARCH}/ ;; "Linux") LIB_EXT="so" @@ -35,6 +39,7 @@ init_vars() { # Cross compiling not supported on linux - Use docker GCC_ARCH="" + DIST_BASE=../../dist/linux-${GOARCH}/ ;; *) ;; @@ -105,6 +110,14 @@ compress() { echo "Finished compression" } +install() { + echo 
"Installing libraries to bin dir ${BUILD_DIR}/bin/" + for lib in $(find ${BUILD_DIR} -name \*.${LIB_EXT}); do + rm -f "${BUILD_DIR}/bin/$(basename ${lib})" + cp -af "${lib}" "${BUILD_DIR}/bin/" + done +} + # Keep the local tree clean after we're done with the build cleanup() { (cd ${LLAMACPP_DIR}/ && git checkout CMakeLists.txt) diff --git a/llm/generate/gen_linux.sh b/llm/generate/gen_linux.sh index db2c6c30..70fc0313 100755 --- a/llm/generate/gen_linux.sh +++ b/llm/generate/gen_linux.sh @@ -51,7 +51,7 @@ if [ -z "${CUDACXX}" ]; then export CUDACXX=$(command -v nvcc) fi fi -COMMON_CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_OPENMP=off" +COMMON_CMAKE_DEFS="-DCMAKE_SKIP_RPATH=on -DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_OPENMP=off" source $(dirname $0)/gen_common.sh init_vars git_module_setup @@ -77,10 +77,11 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then if [ -n "${OLLAMA_CUSTOM_CPU_DEFS}" ]; then init_vars echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\"" - CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}" + CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}" BUILD_DIR="../build/linux/${ARCH}/cpu" echo "Building custom CPU" build + install compress else # Darwin Rosetta x86 emulation does NOT support AVX, AVX2, AVX512 @@ -93,7 +94,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then # -DGGML_AVX512_VBMI -- 2018 Intel Cannon Lake # -DGGML_AVX512_VNNI -- 2021 Intel Alder Lake - COMMON_CPU_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_OPENMP=off" + COMMON_CPU_DEFS="-DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_OPENMP=off" if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu" ]; then # # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta) @@ -103,6 +104,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then BUILD_DIR="../build/linux/${ARCH}/cpu" echo "Building LCD CPU" build + install compress fi @@ -120,6 +122,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then BUILD_DIR="../build/linux/${ARCH}/cpu_avx" echo "Building AVX CPU" build + install compress fi @@ -133,6 +136,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then BUILD_DIR="../build/linux/${ARCH}/cpu_avx2" echo "Building AVX2 CPU" build + install compress fi fi @@ -178,29 +182,18 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${OLLAMA_CUSTOM_CUDA_DEFS}" echo "Building custom CUDA GPU" else - CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_FLAGS=-t8 -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}" + CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}" fi - CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS}" + export CUDAFLAGS="-t8" + CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS} -DGGML_STATIC=off" BUILD_DIR="../build/linux/${ARCH}/cuda${CUDA_VARIANT}" - EXTRA_LIBS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda" + export LLAMA_SERVER_LDFLAGS="-L${CUDA_LIB_DIR} -lcudart -lcublas 
-lcublasLt -lcuda" + CUDA_DIST_DIR="${DIST_BASE}/ollama_libs" build - - # Carry the CUDA libs as payloads to help reduce dependency burden on users - # - # TODO - in the future we may shift to packaging these separately and conditionally - # downloading them in the install script. - DEPS="$(ldd ${BUILD_DIR}/bin/ollama_llama_server )" - for lib in libcudart.so libcublas.so libcublasLt.so ; do - DEP=$(echo "${DEPS}" | grep ${lib} | cut -f1 -d' ' | xargs || true) - if [ -n "${DEP}" -a -e "${CUDA_LIB_DIR}/${DEP}" ]; then - cp "${CUDA_LIB_DIR}/${DEP}" "${BUILD_DIR}/bin/" - elif [ -e "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" ]; then - cp "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" "${BUILD_DIR}/bin/" - elif [ -e "${CUDART_LIB_DIR}/${lib}" ]; then - cp -d ${CUDART_LIB_DIR}/${lib}* "${BUILD_DIR}/bin/" - else - cp -d "${CUDA_LIB_DIR}/${lib}*" "${BUILD_DIR}/bin/" - fi + install + mkdir -p "${CUDA_DIST_DIR}" + for lib in ${CUDA_LIB_DIR}/libcudart.so* ${CUDA_LIB_DIR}/libcublas.so* ${CUDA_LIB_DIR}/libcublasLt.so* ; do + cp -a "${lib}" "${CUDA_DIST_DIR}" done compress @@ -218,21 +211,24 @@ if [ -z "${OLLAMA_SKIP_ONEAPI_GENERATE}" -a -d "${ONEAPI_ROOT}" ]; then CC=icx CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL=ON -DGGML_SYCL_F16=OFF" BUILD_DIR="../build/linux/${ARCH}/oneapi" - EXTRA_LIBS="-fsycl -Wl,-rpath,${ONEAPI_ROOT}/compiler/latest/lib,-rpath,${ONEAPI_ROOT}/mkl/latest/lib,-rpath,${ONEAPI_ROOT}/tbb/latest/lib,-rpath,${ONEAPI_ROOT}/compiler/latest/opt/oclfpga/linux64/lib -lOpenCL -lmkl_core -lmkl_sycl_blas -lmkl_intel_ilp64 -lmkl_tbb_thread -ltbb" + ONEAPI_DIST_DIR="${DIST_BASE}/ollama_libs" + export LLAMA_SERVER_LDFLAGS="-fsycl -lOpenCL -lmkl_core -lmkl_sycl_blas -lmkl_intel_ilp64 -lmkl_tbb_thread -ltbb" DEBUG_FLAGS="" # icx compiles with -O0 if we pass -g, so we must remove it build # copy oneAPI dependencies + mkdir -p "${ONEAPI_DIST_DIR}" for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e sycl -e mkl -e tbb); do - cp "${dep}" "${BUILD_DIR}/bin/" + cp -a "${dep}" "${ONEAPI_DIST_DIR}" done - cp "${ONEAPI_ROOT}/compiler/latest/lib/libOpenCL.so" "${BUILD_DIR}/bin/" - cp "${ONEAPI_ROOT}/compiler/latest/lib/libimf.so" "${BUILD_DIR}/bin/" - cp "${ONEAPI_ROOT}/compiler/latest/lib/libintlc.so.5" "${BUILD_DIR}/bin/" - cp "${ONEAPI_ROOT}/compiler/latest/lib/libirng.so" "${BUILD_DIR}/bin/" - cp "${ONEAPI_ROOT}/compiler/latest/lib/libpi_level_zero.so" "${BUILD_DIR}/bin/" - cp "${ONEAPI_ROOT}/compiler/latest/lib/libsvml.so" "${BUILD_DIR}/bin/" - cp "${ONEAPI_ROOT}/compiler/latest/lib/libur_loader.so.0" "${BUILD_DIR}/bin/" + cp "${ONEAPI_ROOT}/compiler/latest/lib/libOpenCL.so" "${ONEAPI_DIST_DIR}" + cp "${ONEAPI_ROOT}/compiler/latest/lib/libimf.so" "${ONEAPI_DIST_DIR}" + cp "${ONEAPI_ROOT}/compiler/latest/lib/libintlc.so.5" "${ONEAPI_DIST_DIR}" + cp "${ONEAPI_ROOT}/compiler/latest/lib/libirng.so" "${ONEAPI_DIST_DIR}" + cp "${ONEAPI_ROOT}/compiler/latest/lib/libpi_level_zero.so" "${ONEAPI_DIST_DIR}" + cp "${ONEAPI_ROOT}/compiler/latest/lib/libsvml.so" "${ONEAPI_DIST_DIR}" + cp "${ONEAPI_ROOT}/compiler/latest/lib/libur_loader.so.0" "${ONEAPI_DIST_DIR}" + install compress fi @@ -262,21 +258,18 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then echo "Building custom ROCM GPU" fi BUILD_DIR="../build/linux/${ARCH}/rocm${ROCM_VARIANT}" - EXTRA_LIBS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -Wl,-rpath,\$ORIGIN/../../rocm/ -lhipblas -lrocblas -lamdhip64 -lrocsolver 
-lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu" + ROCM_DIST_DIR="${DIST_BASE}/ollama_libs" + # TODO figure out how to disable runpath (rpath) + # export CMAKE_HIP_FLAGS="-fno-rtlib-add-rpath" # doesn't work + export LLAMA_SERVER_LDFLAGS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu" build - # Record the ROCM dependencies - rm -f "${BUILD_DIR}/bin/deps.txt" - touch "${BUILD_DIR}/bin/deps.txt" - for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e rocm -e amdgpu -e libtinfo ); do - echo "${dep}" >> "${BUILD_DIR}/bin/deps.txt" + # copy the ROCM dependencies + mkdir -p "${ROCM_DIST_DIR}" + for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -v "${ARCH}/rocm${ROCM_VARIANT}" | grep -e rocm -e amdgpu -e libtinfo ); do + cp -a "${dep}"* "${ROCM_DIST_DIR}" done - # bomb out if for some reason we didn't get a few deps - if [ $(cat "${BUILD_DIR}/bin/deps.txt" | wc -l ) -lt 8 ] ; then - cat "${BUILD_DIR}/bin/deps.txt" - echo "ERROR: deps file short" - exit 1 - fi + install compress fi diff --git a/llm/generate/gen_windows.ps1 b/llm/generate/gen_windows.ps1 index d8bce92d..1f8c96d8 100644 --- a/llm/generate/gen_windows.ps1 +++ b/llm/generate/gen_windows.ps1 @@ -286,12 +286,11 @@ function build_cuda() { sign install - rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\" - md "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\" -ea 0 > $null - write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\" - cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\" - cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\" - cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\" + md "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" -ea 0 > $null + write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" + cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" + cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" + cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" } else { write-host "Skipping CUDA generation step" } @@ -325,18 +324,17 @@ function build_oneapi() { sign install - rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" - md "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" -ea 0 > $null - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libirngmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libmmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_level_zero.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_unified_runtime.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_win_proxy_loader.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\svml_dispmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" - cp 
"${env:ONEAPI_ROOT}\compiler\latest\bin\sycl7.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" - cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_core.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" - cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_sycl_blas.4.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" - cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_tbb_thread.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" + md "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" -ea 0 > $null + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libirngmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libmmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_level_zero.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_unified_runtime.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_win_proxy_loader.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\svml_dispmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\sycl7.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" + cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_core.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" + cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_sycl_blas.4.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" + cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_tbb_thread.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" } else { Write-Host "Skipping oneAPI generation step" } @@ -386,12 +384,11 @@ function build_rocm() { sign install - rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\" - md "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\rocblas\library\" -ea 0 > $null - cp "${env:HIP_PATH}\bin\hipblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\" - cp "${env:HIP_PATH}\bin\rocblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\" + md "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\rocblas\library\" -ea 0 > $null + cp "${env:HIP_PATH}\bin\hipblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" + cp "${env:HIP_PATH}\bin\rocblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" # amdhip64.dll dependency comes from the driver and must be installed on the host to use AMD GPUs - cp "${env:HIP_PATH}\bin\rocblas\library\*" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\rocblas\library\" + cp "${env:HIP_PATH}\bin\rocblas\library\*" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\rocblas\library\" } else { write-host "Skipping ROCm generation step" } diff --git a/llm/server.go b/llm/server.go index d2b8db9b..9347a458 100644 --- a/llm/server.go +++ b/llm/server.go @@ -306,20 +306,18 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr if runtime.GOOS == "windows" { pathEnv = "PATH" } - // prepend the server directory to LD_LIBRARY_PATH/PATH and the parent dir for common dependencies - libraryPaths := []string{dir, filepath.Dir(dir)} + // Start with the server directory for the LD_LIBRARY_PATH/PATH + libraryPaths := []string{dir} if libraryPath, ok := os.LookupEnv(pathEnv); ok { - // Append our runner directory to the path - // This will 
favor system libraries over our bundled library dependencies + // favor our bundled library dependencies over system libraries libraryPaths = append(libraryPaths, filepath.SplitList(libraryPath)...) } // Note: we always put the dependency path first - // since this was the exact version we verified for AMD GPUs - // and we favor what the user had in their path + // since this was the exact version we compiled/linked against if gpus[0].DependencyPath != "" { - // TODO refine for multi-gpu support + // assume gpus from the same library have the same dependency path libraryPaths = append([]string{gpus[0].DependencyPath}, libraryPaths...) } diff --git a/scripts/build_linux.sh b/scripts/build_linux.sh index 27c4ff1f..4ea51229 100755 --- a/scripts/build_linux.sh +++ b/scripts/build_linux.sh @@ -21,11 +21,9 @@ for TARGETARCH in ${BUILD_ARCH}; do -t builder:$TARGETARCH \ . docker create --platform linux/$TARGETARCH --name builder-$TARGETARCH builder:$TARGETARCH - docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/ollama ./dist/ollama-linux-$TARGETARCH - - if [ "$TARGETARCH" = "amd64" ]; then - docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/dist/deps/ ./dist/ - fi - + docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/dist/linux-$TARGETARCH ./dist docker rm builder-$TARGETARCH + echo "Compressing final linux bundle..." + rm -f ./dist/ollama-linux-$TARGETARCH.tgz + (cd dist/linux-$TARGETARCH && tar cf - . | gzip --best > ../ollama-linux-$TARGETARCH.tgz ) done diff --git a/scripts/build_windows.ps1 b/scripts/build_windows.ps1 index edc73759..e8d851f4 100644 --- a/scripts/build_windows.ps1 +++ b/scripts/build_windows.ps1 @@ -103,22 +103,22 @@ function buildApp() { function gatherDependencies() { write-host "Gathering runtime dependencies" cd "${script:SRC_DIR}" - md "${script:DEPS_DIR}\ollama_runners" -ea 0 > $null + md "${script:DEPS_DIR}\ollama_libs" -ea 0 > $null # TODO - this varies based on host build system and MSVC version - drive from dumpbin output # currently works for Win11 + MSVC 2019 + Cuda V11 - cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\msvcp140*.dll" "${script:DEPS_DIR}\ollama_runners\" - cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DEPS_DIR}\ollama_runners\" - cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DEPS_DIR}\ollama_runners\" + cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\msvcp140*.dll" "${script:DEPS_DIR}\ollama_libs\" + cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DEPS_DIR}\ollama_libs\" + cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DEPS_DIR}\ollama_libs\" foreach ($part in $("runtime", "stdio", "filesystem", "math", "convert", "heap", "string", "time", "locale", "environment")) { - cp "$env:VCToolsRedistDir\..\..\..\Tools\Llvm\x64\bin\api-ms-win-crt-${part}*.dll" "${script:DEPS_DIR}\ollama_runners\" + cp "$env:VCToolsRedistDir\..\..\..\Tools\Llvm\x64\bin\api-ms-win-crt-${part}*.dll" "${script:DEPS_DIR}\ollama_libs\" } cp "${script:SRC_DIR}\app\ollama_welcome.ps1" "${script:SRC_DIR}\dist\" if ("${env:KEY_CONTAINER}") { write-host "about to sign" - foreach ($file in (get-childitem "${script:DEPS_DIR}\cuda\cu*.dll") + @("${script:SRC_DIR}\dist\ollama_welcome.ps1")){ + foreach ($file in (get-childitem "${script:DEPS_DIR}\ollama_libs\cu*.dll") + @("${script:SRC_DIR}\dist\ollama_welcome.ps1")){ write-host "signing $file" & "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f 
"${script:OLLAMA_CERT}" ` /csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} $file diff --git a/scripts/install.sh b/scripts/install.sh index 03af5a69..f0439b00 100644 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -63,16 +63,32 @@ if [ -n "$NEEDS" ]; then exit 1 fi -status "Downloading ollama..." -curl --fail --show-error --location --progress-bar -o $TEMP_DIR/ollama "https://ollama.com/download/ollama-linux-${ARCH}${VER_PARAM}" - for BINDIR in /usr/local/bin /usr/bin /bin; do echo $PATH | grep -q $BINDIR && break || continue done +OLLAMA_INSTALL_DIR=${OLLAMA_INSTALL_DIR:-${BINDIR}} -status "Installing ollama to $BINDIR..." +status "Installing ollama to $OLLAMA_INSTALL_DIR" $SUDO install -o0 -g0 -m755 -d $BINDIR -$SUDO install -o0 -g0 -m755 $TEMP_DIR/ollama $BINDIR/ollama +$SUDO install -o0 -g0 -m755 -d "$OLLAMA_INSTALL_DIR" +if curl -I --silent --fail --location "https://ollama.com/download/ollama-linux-${ARCH}.tgz${VER_PARAM}" >/dev/null ; then + status "Downloading Linux ${ARCH} bundle" + curl --fail --show-error --location --progress-bar \ + "https://ollama.com/download/ollama-linux-${ARCH}.tgz${VER_PARAM}" | \ + $SUDO tar -xzf - -C "$OLLAMA_INSTALL_DIR" + BUNDLE=1 +else + status "Downloading Linux ${ARCH} CLI" + curl --fail --show-error --location --progress-bar -o "$TEMP_DIR/ollama"\ + "https://ollama.com/download/ollama-linux-${ARCH}${VER_PARAM}" + $SUDO install -o0 -g0 -m755 $TEMP_DIR/ollama $OLLAMA_INSTALL_DIR/ollama + BUNDLE=0 +fi + +if [ "$OLLAMA_INSTALL_DIR/ollama" != "$BINDIR/ollama" ] ; then + status "Making ollama accessible in the PATH in $BINDIR" + $SUDO ln -sf "$OLLAMA_INSTALL_DIR/ollama" "$BINDIR/ollama" +fi install_success() { status 'The Ollama API is now available at 127.0.0.1:11434.' @@ -178,6 +194,11 @@ if ! check_gpu lspci nvidia && ! check_gpu lshw nvidia && ! check_gpu lspci amdg fi if check_gpu lspci amdgpu || check_gpu lshw amdgpu; then + if [ $BUNDLE -ne 0 ]; then + install_success + status "AMD GPU ready." 
+ exit 0 + fi # Look for pre-existing ROCm v6 before downloading the dependencies for search in "${HIP_PATH:-''}" "${ROCM_PATH:-''}" "/opt/rocm" "/usr/lib64"; do if [ -n "${search}" ] && [ -e "${search}/libhipblas.so.2" -o -e "${search}/lib/libhipblas.so.2" ]; then From c7bcb0031965e33531358639620a11516d101b54 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Fri, 9 Aug 2024 07:21:40 -0700 Subject: [PATCH 54/71] Wire up ccache and pigz in the docker based build This should help speed things up a little --- Dockerfile | 37 ++++++++++++++++++++++++++----------- llm/generate/gen_common.sh | 15 +++++++++------ llm/generate/gen_darwin.sh | 2 ++ llm/generate/gen_linux.sh | 2 ++ scripts/build_linux.sh | 3 ++- scripts/rh_linux_deps.sh | 14 ++++++++++++-- 6 files changed, 53 insertions(+), 20 deletions(-) diff --git a/Dockerfile b/Dockerfile index 120ddc21..8eb90057 100644 --- a/Dockerfile +++ b/Dockerfile @@ -19,7 +19,8 @@ COPY --from=llm-code / /go/src/github.com/ollama/ollama/ WORKDIR /go/src/github.com/ollama/ollama/llm/generate ARG CGO_CFLAGS ENV GOARCH amd64 -RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh +RUN --mount=type=cache,target=/root/.ccache \ + OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION-devel-rockylinux8 AS cuda-build-arm64 ARG CMAKE_VERSION @@ -30,7 +31,12 @@ COPY --from=llm-code / /go/src/github.com/ollama/ollama/ WORKDIR /go/src/github.com/ollama/ollama/llm/generate ARG CGO_CFLAGS ENV GOARCH arm64 -RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh +RUN --mount=type=cache,target=/root/.ccache \ + OLLAMA_SKIP_STATIC_GENERATE=1 \ + OLLAMA_SKIP_CPU_GENERATE=1 \ + CMAKE_CUDA_ARCHITECTURES="${CUDA_V11_ARCHITECTURES}" \ + CUDA_VARIANT="_v11" \ + bash gen_linux.sh FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS rocm-build-amd64 ARG CMAKE_VERSION @@ -43,7 +49,8 @@ WORKDIR /go/src/github.com/ollama/ollama/llm/generate ARG CGO_CFLAGS ARG AMDGPU_TARGETS ENV GOARCH amd64 -RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh +RUN --mount=type=cache,target=/root/.ccache \ + OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh RUN mkdir -p ../../dist/linux-amd64/ollama_libs && \ (cd /opt/rocm/lib && tar cf - rocblas/library) | (cd ../../dist/linux-amd64/ollama_libs && tar xf - ) @@ -60,13 +67,17 @@ ENV GOARCH amd64 WORKDIR /go/src/github.com/ollama/ollama/llm/generate FROM --platform=linux/amd64 cpu-builder-amd64 AS static-build-amd64 -RUN OLLAMA_CPU_TARGET="static" sh gen_linux.sh +RUN --mount=type=cache,target=/root/.ccache \ + OLLAMA_CPU_TARGET="static" bash gen_linux.sh FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu-build-amd64 -RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu" sh gen_linux.sh +RUN --mount=type=cache,target=/root/.ccache \ + OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu" bash gen_linux.sh FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu_avx-build-amd64 -RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu_avx" sh gen_linux.sh +RUN --mount=type=cache,target=/root/.ccache \ + OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu_avx" bash gen_linux.sh FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu_avx2-build-amd64 -RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu_avx2" sh gen_linux.sh +RUN --mount=type=cache,target=/root/.ccache \ + OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu_avx2" bash gen_linux.sh FROM 
--platform=linux/arm64 rockylinux:8 AS cpu-builder-arm64 ARG CMAKE_VERSION @@ -81,9 +92,11 @@ ENV GOARCH arm64 WORKDIR /go/src/github.com/ollama/ollama/llm/generate FROM --platform=linux/arm64 cpu-builder-arm64 AS static-build-arm64 -RUN OLLAMA_CPU_TARGET="static" sh gen_linux.sh +RUN --mount=type=cache,target=/root/.ccache \ + OLLAMA_CPU_TARGET="static" bash gen_linux.sh FROM --platform=linux/arm64 cpu-builder-arm64 AS cpu-build-arm64 -RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu" sh gen_linux.sh +RUN --mount=type=cache,target=/root/.ccache \ + OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu" bash gen_linux.sh # Intermediate stage used for ./scripts/build_linux.sh @@ -100,7 +113,8 @@ COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/ COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ ARG GOFLAGS ARG CGO_CFLAGS -RUN go build -trimpath -o dist/linux-amd64/ollama . +RUN --mount=type=cache,target=/root/.ccache \ + go build -trimpath -o dist/linux-amd64/ollama . # Intermediate stage used for ./scripts/build_linux.sh FROM --platform=linux/arm64 cpu-build-arm64 AS build-arm64 @@ -113,7 +127,8 @@ COPY --from=cuda-build-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ COPY --from=cuda-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ ARG GOFLAGS ARG CGO_CFLAGS -RUN go build -trimpath -o dist/linux-arm64/ollama . +RUN --mount=type=cache,target=/root/.ccache \ + go build -trimpath -o dist/linux-arm64/ollama . # Runtime stages FROM --platform=linux/amd64 ubuntu:22.04 as runtime-amd64 diff --git a/llm/generate/gen_common.sh b/llm/generate/gen_common.sh index f1541f2a..40115936 100644 --- a/llm/generate/gen_common.sh +++ b/llm/generate/gen_common.sh @@ -47,6 +47,7 @@ init_vars() { if [ -z "${CMAKE_CUDA_ARCHITECTURES}" ] ; then CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80" fi + GZIP=$(which pigz 2>/dev/null || echo "gzip") } git_module_setup() { @@ -90,21 +91,23 @@ build() { compress() { echo "Compressing payloads to reduce overall binary size..." - pids="" rm -rf ${BUILD_DIR}/bin/*.gz for f in ${BUILD_DIR}/bin/* ; do - gzip -n --best -f ${f} & - pids+=" $!" + ${GZIP} -n --best -f ${f} & + compress_pids+=" $!" done # check for lib directory if [ -d ${BUILD_DIR}/lib ]; then for f in ${BUILD_DIR}/lib/* ; do - gzip -n --best -f ${f} & - pids+=" $!" + ${GZIP} -n --best -f ${f} & + compress_pids+=" $!" done fi echo - for pid in ${pids}; do +} + +wait_for_compress() { + for pid in ${compress_pids}; do wait $pid done echo "Finished compression" diff --git a/llm/generate/gen_darwin.sh b/llm/generate/gen_darwin.sh index 6c0b62cb..f22c0f8e 100755 --- a/llm/generate/gen_darwin.sh +++ b/llm/generate/gen_darwin.sh @@ -6,6 +6,7 @@ set -ex set -o pipefail +compress_pids="" echo "Starting darwin generate script" source $(dirname $0)/gen_common.sh init_vars @@ -98,4 +99,5 @@ case "${GOARCH}" in esac cleanup +wait_for_compress echo "go generate completed. LLM runners: $(cd ${BUILD_DIR}/..; echo *)" diff --git a/llm/generate/gen_linux.sh b/llm/generate/gen_linux.sh index 70fc0313..1365d07d 100755 --- a/llm/generate/gen_linux.sh +++ b/llm/generate/gen_linux.sh @@ -13,6 +13,7 @@ set -ex set -o pipefail +compress_pids="" # See https://llvm.org/docs/AMDGPUUsage.html#processors for reference amdGPUs() { @@ -274,4 +275,5 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then fi cleanup +wait_for_compress echo "go generate completed. 
LLM runners: $(cd ${BUILD_DIR}/..; echo *)" diff --git a/scripts/build_linux.sh b/scripts/build_linux.sh index 4ea51229..ebb60c5a 100755 --- a/scripts/build_linux.sh +++ b/scripts/build_linux.sh @@ -4,6 +4,7 @@ set -eu export VERSION=${VERSION:-$(git describe --tags --first-parent --abbrev=7 --long --dirty --always | sed -e "s/^v//g")} export GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$VERSION\" \"-X=github.com/ollama/ollama/server.mode=release\"'" +GZIP=$(which pigz 2>/dev/null || echo "gzip") BUILD_ARCH=${BUILD_ARCH:-"amd64 arm64"} export AMDGPU_TARGETS=${AMDGPU_TARGETS:=""} @@ -25,5 +26,5 @@ for TARGETARCH in ${BUILD_ARCH}; do docker rm builder-$TARGETARCH echo "Compressing final linux bundle..." rm -f ./dist/ollama-linux-$TARGETARCH.tgz - (cd dist/linux-$TARGETARCH && tar cf - . | gzip --best > ../ollama-linux-$TARGETARCH.tgz ) + (cd dist/linux-$TARGETARCH && tar cf - . | ${GZIP} --best > ../ollama-linux-$TARGETARCH.tgz ) done diff --git a/scripts/rh_linux_deps.sh b/scripts/rh_linux_deps.sh index 81648d68..b4c9afd6 100644 --- a/scripts/rh_linux_deps.sh +++ b/scripts/rh_linux_deps.sh @@ -3,6 +3,7 @@ # Script for common Dockerfile dependency installation in redhat linux based images set -ex +set -o pipefail MACHINE=$(uname -m) if grep -i "centos" /etc/system-release >/dev/null; then @@ -29,7 +30,7 @@ if grep -i "centos" /etc/system-release >/dev/null; then dnf install -y rh-git227-git ln -s /opt/rh/rh-git227/root/usr/bin/git /usr/local/bin/git fi - dnf install -y devtoolset-10-gcc devtoolset-10-gcc-c++ + dnf install -y devtoolset-10-gcc devtoolset-10-gcc-c++ pigz elif grep -i "rocky" /etc/system-release >/dev/null; then # Temporary workaround until rocky 8 AppStream ships GCC 10.4 (10.3 is incompatible with NVCC) cat << EOF > /etc/yum.repos.d/Rocky-Vault.repo @@ -43,12 +44,21 @@ gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-rockyofficial EOF dnf install -y git \ gcc-toolset-10-gcc-10.2.1-8.2.el8 \ - gcc-toolset-10-gcc-c++-10.2.1-8.2.el8 + gcc-toolset-10-gcc-c++-10.2.1-8.2.el8 \ + pigz else echo "ERROR Unexpected distro" exit 1 fi +if [ "${MACHINE}" = "x86_64" ] ; then + curl -s -L https://github.com/ccache/ccache/releases/download/v4.10.2/ccache-4.10.2-linux-x86_64.tar.xz | tar -Jx -C /tmp --strip-components 1 && \ + mv /tmp/ccache /usr/local/bin/ +else + yum -y install epel-release + yum install -y ccache +fi + if [ -n "${CMAKE_VERSION}" ]; then curl -s -L https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-$(uname -m).tar.gz | tar -zx -C /usr --strip-components 1 fi From d470ebe78bc76c098bc378f98f08f7094063ab4d Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Thu, 30 May 2024 21:54:07 -0700 Subject: [PATCH 55/71] Add Jetson cuda variants for arm This adds new variants for arm64 specific to Jetson platforms --- Dockerfile | 48 +++++++++++++++++++++++++++++++++++---- gpu/gpu.go | 44 +++++++++++++++++++++++++++++++++-- gpu/gpu_darwin.go | 4 ++-- gpu/types.go | 6 ++--- llm/generate/gen_linux.sh | 5 ++-- llm/payload.go | 4 ++-- scripts/build_linux.sh | 1 + 7 files changed, 96 insertions(+), 16 deletions(-) diff --git a/Dockerfile b/Dockerfile index 8eb90057..79b2a696 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,6 +3,9 @@ ARG CMAKE_VERSION=3.22.1 # this CUDA_VERSION corresponds with the one specified in docs/gpu.md ARG CUDA_VERSION=11.3.1 ARG ROCM_VERSION=6.1.2 +ARG JETPACK_6=r36.2.0 +ARG JETPACK_5=r35.4.1 +ARG JETPACK_4=r32.7.1 # Copy the minimal context we need to run the generate scripts FROM scratch AS llm-code 
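The JetPack stages above pair with runtime detection added later in this patch (gpu/gpu.go), which probes /etc/nv_tegra_release to pick a matching variant. A minimal standalone sketch of that probe, with an assumed sample of the file's first line:

package main

import (
	"fmt"
	"regexp"
)

func main() {
	// assumed sample contents of /etc/nv_tegra_release on a JetPack 5 device
	data := "# R35 (release), REVISION: 4.1, GCID: 33958178, BOARD: t186ref"
	if m := regexp.MustCompile(` R(\d+) `).FindStringSubmatch(data); len(m) == 2 {
		fmt.Println("L4T major release:", m[1]) // R35 -> jetpack5, R36 -> jetpack6
	}
}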
@@ -22,7 +25,7 @@ ENV GOARCH amd64 RUN --mount=type=cache,target=/root/.ccache \ OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh -FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION-devel-rockylinux8 AS cuda-build-arm64 +FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION-devel-rockylinux8 AS cuda-build-server-arm64 ARG CMAKE_VERSION COPY ./scripts/rh_linux_deps.sh / RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh @@ -31,11 +34,40 @@ COPY --from=llm-code / /go/src/github.com/ollama/ollama/ WORKDIR /go/src/github.com/ollama/ollama/llm/generate ARG CGO_CFLAGS ENV GOARCH arm64 +RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh + +FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_6} AS cuda-build-jetpack6-arm64 +ARG CMAKE_VERSION +RUN apt-get update && apt-get install -y git curl && \ + curl -s -L https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-$(uname -m).tar.gz | tar -zx -C /usr --strip-components 1 +COPY --from=llm-code / /go/src/github.com/ollama/ollama/ +WORKDIR /go/src/github.com/ollama/ollama/llm/generate +ARG CGO_CFLAGS +ENV GOARCH arm64 +ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs RUN --mount=type=cache,target=/root/.ccache \ OLLAMA_SKIP_STATIC_GENERATE=1 \ OLLAMA_SKIP_CPU_GENERATE=1 \ - CMAKE_CUDA_ARCHITECTURES="${CUDA_V11_ARCHITECTURES}" \ - CUDA_VARIANT="_v11" \ + CUDA_VARIANT="_jetpack6" \ + CUDA_DIST_DIR="/go/src/github.com/ollama/ollama/dist/linux-arm64/ollama_libs/cuda_jetpack6" \ + CMAKE_CUDA_ARCHITECTURES="87" \ + bash gen_linux.sh + +FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_5} AS cuda-build-jetpack5-arm64 +ARG CMAKE_VERSION +RUN apt-get update && apt-get install -y git curl && \ + curl -s -L https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-$(uname -m).tar.gz | tar -zx -C /usr --strip-components 1 +COPY --from=llm-code / /go/src/github.com/ollama/ollama/ +WORKDIR /go/src/github.com/ollama/ollama/llm/generate +ARG CGO_CFLAGS +ENV GOARCH arm64 +ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs +RUN --mount=type=cache,target=/root/.ccache \ + OLLAMA_SKIP_STATIC_GENERATE=1 \ + OLLAMA_SKIP_CPU_GENERATE=1 \ + CUDA_VARIANT="_jetpack5" \ + CUDA_DIST_DIR="/go/src/github.com/ollama/ollama/dist/linux-arm64/ollama_libs/cuda_jetpack5" \ + CMAKE_CUDA_ARCHITECTURES="72;87" \ bash gen_linux.sh FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS rocm-build-amd64 @@ -123,8 +155,14 @@ ARG GOLANG_VERSION WORKDIR /go/src/github.com/ollama/ollama COPY . . 
COPY --from=static-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ -COPY --from=cuda-build-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ -COPY --from=cuda-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ +COPY --from=cuda-build-server-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ +COPY --from=cuda-build-server-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ +## arm binary += 381M +COPY --from=cuda-build-jetpack6-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ +COPY --from=cuda-build-jetpack6-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ +## arm binary += 330M +COPY --from=cuda-build-jetpack5-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ +COPY --from=cuda-build-jetpack5-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ ARG GOFLAGS ARG CGO_CFLAGS RUN --mount=type=cache,target=/root/.ccache \ diff --git a/gpu/gpu.go b/gpu/gpu.go index d0ae0f34..22461922 100644 --- a/gpu/gpu.go +++ b/gpu/gpu.go @@ -15,7 +15,9 @@ import ( "log/slog" "os" "path/filepath" + "regexp" "runtime" + "strconv" "strings" "sync" "unsafe" @@ -215,7 +217,7 @@ func GetGPUInfo() GpuInfoList { GpuInfo: GpuInfo{ memInfo: mem, Library: "cpu", - Variant: cpuCapability, + Variant: cpuCapability.String(), ID: "0", }, }, @@ -231,6 +233,35 @@ func GetGPUInfo() GpuInfoList { depPath := GetDepDir() + var cudaVariant string + if runtime.GOARCH == "arm64" && runtime.GOOS == "linux" { + if CudaTegra != "" { + ver := strings.Split(CudaTegra, ".") + if len(ver) > 0 { + cudaVariant = "jetpack" + ver[0] + } + } else if data, err := os.ReadFile("/etc/nv_tegra_release"); err == nil { + r := regexp.MustCompile(` R(\d+) `) + m := r.FindSubmatch(data) + if len(m) != 2 { + slog.Info("Unexpected format for /etc/nv_tegra_release. 
Set JETSON_JETPACK to select version") + } else { + if l4t, err := strconv.Atoi(string(m[1])); err == nil { + // Note: mapping from L4t -> JP is inconsistent (can't just subtract 30) + // https://developer.nvidia.com/embedded/jetpack-archive + switch l4t { + case 35: + cudaVariant = "jetpack5" + case 36: + cudaVariant = "jetpack6" + default: + slog.Info("unsupported L4T version", "nv_tegra_release", string(data)) + } + } + } + } + } + // Load ALL libraries cHandles = initCudaHandles() @@ -240,6 +271,7 @@ func GetGPUInfo() GpuInfoList { gpuInfo := CudaGPUInfo{ GpuInfo: GpuInfo{ Library: "cuda", + Variant: cudaVariant, }, index: i, } @@ -266,7 +298,15 @@ func GetGPUInfo() GpuInfoList { gpuInfo.ID = C.GoString(&memInfo.gpu_id[0]) gpuInfo.Compute = fmt.Sprintf("%d.%d", memInfo.major, memInfo.minor) gpuInfo.MinimumMemory = cudaMinimumMemory - gpuInfo.DependencyPath = depPath + if depPath != "" { + gpuInfo.DependencyPath = depPath + // Check for variant specific directory + if cudaVariant != "" { + if _, err := os.Stat(filepath.Join(depPath, "cuda_"+cudaVariant)); err == nil { + gpuInfo.DependencyPath = filepath.Join(depPath, "cuda_"+cudaVariant) + } + } + } gpuInfo.Name = C.GoString(&memInfo.gpu_name[0]) gpuInfo.DriverMajor = driverMajor gpuInfo.DriverMinor = driverMinor diff --git a/gpu/gpu_darwin.go b/gpu/gpu_darwin.go index 9d9fd84e..417b48df 100644 --- a/gpu/gpu_darwin.go +++ b/gpu/gpu_darwin.go @@ -25,7 +25,7 @@ func GetGPUInfo() GpuInfoList { return []GpuInfo{ { Library: "cpu", - Variant: GetCPUCapability(), + Variant: GetCPUCapability().String(), memInfo: mem, }, } @@ -48,7 +48,7 @@ func GetCPUInfo() GpuInfoList { return []GpuInfo{ { Library: "cpu", - Variant: GetCPUCapability(), + Variant: GetCPUCapability().String(), memInfo: mem, }, } diff --git a/gpu/types.go b/gpu/types.go index 8d22b06b..fc628d47 100644 --- a/gpu/types.go +++ b/gpu/types.go @@ -19,7 +19,7 @@ type GpuInfo struct { Library string `json:"library,omitempty"` // Optional variant to select (e.g. versions, cpu feature flags) - Variant CPUCapability `json:"variant"` + Variant string `json:"variant"` // MinimumMemory represents the minimum memory required to use the GPU MinimumMemory uint64 `json:"-"` @@ -81,8 +81,8 @@ func (l GpuInfoList) ByLibrary() []GpuInfoList { for _, info := range l { found := false requested := info.Library - if info.Variant != CPUCapabilityNone { - requested += "_" + info.Variant.String() + if info.Variant != CPUCapabilityNone.String() { + requested += "_" + info.Variant } for i, lib := range libs { if lib == requested { diff --git a/llm/generate/gen_linux.sh b/llm/generate/gen_linux.sh index 1365d07d..dc9dda5a 100755 --- a/llm/generate/gen_linux.sh +++ b/llm/generate/gen_linux.sh @@ -165,7 +165,7 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then echo "CUDA libraries detected - building dynamic CUDA library" init_vars CUDA_MAJOR=$(ls "${CUDA_LIB_DIR}"/libcudart.so.* | head -1 | cut -f3 -d. 
|| true) - if [ -n "${CUDA_MAJOR}" ]; then + if [ -n "${CUDA_MAJOR}" -a -z "${CUDA_VARIANT}" ]; then CUDA_VARIANT=_v${CUDA_MAJOR} fi if [ "${ARCH}" == "arm64" ]; then @@ -189,9 +189,10 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS} -DGGML_STATIC=off" BUILD_DIR="../build/linux/${ARCH}/cuda${CUDA_VARIANT}" export LLAMA_SERVER_LDFLAGS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda" - CUDA_DIST_DIR="${DIST_BASE}/ollama_libs" + CUDA_DIST_DIR="${CUDA_DIST_DIR:-${DIST_BASE}/ollama_libs}" build install + echo "Installing CUDA dependencies in ${CUDA_DIST_DIR}" mkdir -p "${CUDA_DIST_DIR}" for lib in ${CUDA_LIB_DIR}/libcudart.so* ${CUDA_LIB_DIR}/libcublas.so* ${CUDA_LIB_DIR}/libcublasLt.so* ; do cp -a "${lib}" "${CUDA_DIST_DIR}" diff --git a/llm/payload.go b/llm/payload.go index b402e1f2..963b3295 100644 --- a/llm/payload.go +++ b/llm/payload.go @@ -82,8 +82,8 @@ func serversForGpu(info gpu.GpuInfo) []string { // glob workDir for files that start with ollama_ availableServers := getAvailableServers() requested := info.Library - if info.Variant != gpu.CPUCapabilityNone { - requested += "_" + info.Variant.String() + if info.Variant != gpu.CPUCapabilityNone.String() { + requested += "_" + info.Variant } servers := []string{} diff --git a/scripts/build_linux.sh b/scripts/build_linux.sh index ebb60c5a..adda2ad7 100755 --- a/scripts/build_linux.sh +++ b/scripts/build_linux.sh @@ -22,6 +22,7 @@ for TARGETARCH in ${BUILD_ARCH}; do -t builder:$TARGETARCH \ . docker create --platform linux/$TARGETARCH --name builder-$TARGETARCH builder:$TARGETARCH + rm -rf ./dist/linux-$TARGETARCH docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/dist/linux-$TARGETARCH ./dist docker rm builder-$TARGETARCH echo "Compressing final linux bundle..." From fc3b4cda89f468f923e2e6095c6a62a5c3c336ff Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Wed, 19 Jun 2024 09:36:30 -0700 Subject: [PATCH 56/71] Report GPU variant in log --- gpu/types.go | 1 + 1 file changed, 1 insertion(+) diff --git a/gpu/types.go b/gpu/types.go index fc628d47..88539078 100644 --- a/gpu/types.go +++ b/gpu/types.go @@ -105,6 +105,7 @@ func (l GpuInfoList) LogDetails() { slog.Info("inference compute", "id", g.ID, "library", g.Library, + "variant", g.Variant, "compute", g.Compute, "driver", fmt.Sprintf("%d.%d", g.DriverMajor, g.DriverMinor), "name", g.Name, From 4fe3a556faf790ba993223cfdda16e281b6cb76d Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Thu, 13 Jun 2024 20:46:14 -0700 Subject: [PATCH 57/71] Add cuda v12 variant and selection logic Based on compute capability and driver version, pick v12 or v11 cuda variants. 
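In sketch form, the selection rule introduced here (mirroring the check added in gpu/cuda_common.go below), assuming the compute capability and driver major version are already known:

func pickCudaVariant(computeMajor, driverMajor int) string {
	// GPUs below compute capability 6.x, or drivers older than CUDA 12,
	// fall back to the v11 build
	if computeMajor < 6 || driverMajor < 12 {
		return "v11"
	}
	return "v12"
}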
--- Dockerfile | 43 +++++++++++++++++++++++++++++++++---------- gpu/cuda_common.go | 43 +++++++++++++++++++++++++++++++++++++++++++ gpu/gpu.go | 40 ++++------------------------------------ gpu/types.go | 6 ++++-- 4 files changed, 84 insertions(+), 48 deletions(-) diff --git a/Dockerfile b/Dockerfile index 79b2a696..e200f5d4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,7 +1,7 @@ ARG GOLANG_VERSION=1.22.5 ARG CMAKE_VERSION=3.22.1 -# this CUDA_VERSION corresponds with the one specified in docs/gpu.md -ARG CUDA_VERSION=11.3.1 +ARG CUDA_VERSION_11=11.3.1 +ARG CUDA_VERSION_12=12.4.0 ARG ROCM_VERSION=6.1.2 ARG JETPACK_6=r36.2.0 ARG JETPACK_5=r35.4.1 @@ -13,7 +13,7 @@ COPY .git .git COPY .gitmodules .gitmodules COPY llm llm -FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION-devel-centos7 AS cuda-build-amd64 +FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION_11-devel-centos7 AS cuda-11-build-amd64 ARG CMAKE_VERSION COPY ./scripts/rh_linux_deps.sh / RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh @@ -23,9 +23,29 @@ WORKDIR /go/src/github.com/ollama/ollama/llm/generate ARG CGO_CFLAGS ENV GOARCH amd64 RUN --mount=type=cache,target=/root/.ccache \ - OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh + OLLAMA_SKIP_STATIC_GENERATE=1 \ + OLLAMA_SKIP_CPU_GENERATE=1 \ + CMAKE_CUDA_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86" \ + CUDA_VARIANT="_v11" \ + bash gen_linux.sh -FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION-devel-rockylinux8 AS cuda-build-server-arm64 +FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION_12-devel-centos7 AS cuda-12-build-amd64 +ARG CMAKE_VERSION +COPY ./scripts/rh_linux_deps.sh / +RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh +ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH +COPY --from=llm-code / /go/src/github.com/ollama/ollama/ +WORKDIR /go/src/github.com/ollama/ollama/llm/generate +ARG CGO_CFLAGS +ENV GOARCH amd64 +RUN --mount=type=cache,target=/root/.ccache \ + OLLAMA_SKIP_STATIC_GENERATE=1 \ + OLLAMA_SKIP_CPU_GENERATE=1 \ + CMAKE_CUDA_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a" \ + CUDA_VARIANT="_v12" \ + bash gen_linux.sh + +FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_11-devel-rockylinux8 AS cuda-11-build-server-arm64 ARG CMAKE_VERSION COPY ./scripts/rh_linux_deps.sh / RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh @@ -34,7 +54,8 @@ COPY --from=llm-code / /go/src/github.com/ollama/ollama/ WORKDIR /go/src/github.com/ollama/ollama/llm/generate ARG CGO_CFLAGS ENV GOARCH arm64 -RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh +RUN --mount=type=cache,target=/root/.ccache \ + OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_6} AS cuda-build-jetpack6-arm64 ARG CMAKE_VERSION @@ -139,8 +160,10 @@ COPY . . 
COPY --from=static-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ -COPY --from=cuda-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/ -COPY --from=cuda-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ +COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/ +COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ +COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/ +COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/ COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ ARG GOFLAGS @@ -155,8 +178,8 @@ ARG GOLANG_VERSION WORKDIR /go/src/github.com/ollama/ollama COPY . . COPY --from=static-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ -COPY --from=cuda-build-server-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ -COPY --from=cuda-build-server-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ +COPY --from=cuda-11-build-server-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ +COPY --from=cuda-11-build-server-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ ## arm binary += 381M COPY --from=cuda-build-jetpack6-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ COPY --from=cuda-build-jetpack6-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ diff --git a/gpu/cuda_common.go b/gpu/cuda_common.go index c90a644c..defaa60a 100644 --- a/gpu/cuda_common.go +++ b/gpu/cuda_common.go @@ -4,9 +4,17 @@ package gpu import ( "log/slog" + "os" + "regexp" + "runtime" + "strconv" "strings" ) +// Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed. +// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices. +var CudaTegra string = os.Getenv("JETSON_JETPACK") + func cudaGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) { ids := []string{} for _, info := range gpuInfo { @@ -19,3 +27,38 @@ func cudaGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) { } return "CUDA_VISIBLE_DEVICES", strings.Join(ids, ",") } + +func cudaGetVariant(gpuInfo CudaGPUInfo) string { + if runtime.GOARCH == "arm64" && runtime.GOOS == "linux" { + if CudaTegra != "" { + ver := strings.Split(CudaTegra, ".") + if len(ver) > 0 { + return "jetpack" + ver[0] + } + } else if data, err := os.ReadFile("/etc/nv_tegra_release"); err == nil { + r := regexp.MustCompile(` R(\d+) `) + m := r.FindSubmatch(data) + if len(m) != 2 { + slog.Info("Unexpected format for /etc/nv_tegra_release. 
Set JETSON_JETPACK to select version") + } else { + if l4t, err := strconv.Atoi(string(m[1])); err == nil { + // Note: mapping from L4t -> JP is inconsistent (can't just subtract 30) + // https://developer.nvidia.com/embedded/jetpack-archive + switch l4t { + case 35: + return "jetpack5" + case 36: + return "jetpack6" + default: + slog.Info("unsupported L4T version", "nv_tegra_release", string(data)) + } + } + } + } + } + + if gpuInfo.computeMajor < 6 || gpuInfo.DriverMajor < 12 { + return "v11" + } + return "v12" +} diff --git a/gpu/gpu.go b/gpu/gpu.go index 22461922..eb87807a 100644 --- a/gpu/gpu.go +++ b/gpu/gpu.go @@ -15,9 +15,7 @@ import ( "log/slog" "os" "path/filepath" - "regexp" "runtime" - "strconv" "strings" "sync" "unsafe" @@ -66,10 +64,6 @@ var RocmComputeMin = 9 // TODO find a better way to detect iGPU instead of minimum memory const IGPUMemLimit = 1 * format.GibiByte // 512G is what they typically report, so anything less than 1G must be iGPU -// Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed. -// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices. -var CudaTegra string = os.Getenv("JETSON_JETPACK") - // Note: gpuMutex must already be held func initCudaHandles() *cudaHandles { // TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing @@ -233,35 +227,6 @@ func GetGPUInfo() GpuInfoList { depPath := GetDepDir() - var cudaVariant string - if runtime.GOARCH == "arm64" && runtime.GOOS == "linux" { - if CudaTegra != "" { - ver := strings.Split(CudaTegra, ".") - if len(ver) > 0 { - cudaVariant = "jetpack" + ver[0] - } - } else if data, err := os.ReadFile("/etc/nv_tegra_release"); err == nil { - r := regexp.MustCompile(` R(\d+) `) - m := r.FindSubmatch(data) - if len(m) != 2 { - slog.Info("Unexpected format for /etc/nv_tegra_release. 
Set JETSON_JETPACK to select version") - } else { - if l4t, err := strconv.Atoi(string(m[1])); err == nil { - // Note: mapping from L4t -> JP is inconsistent (can't just subtract 30) - // https://developer.nvidia.com/embedded/jetpack-archive - switch l4t { - case 35: - cudaVariant = "jetpack5" - case 36: - cudaVariant = "jetpack6" - default: - slog.Info("unsupported L4T version", "nv_tegra_release", string(data)) - } - } - } - } - } - // Load ALL libraries cHandles = initCudaHandles() @@ -271,7 +236,6 @@ func GetGPUInfo() GpuInfoList { gpuInfo := CudaGPUInfo{ GpuInfo: GpuInfo{ Library: "cuda", - Variant: cudaVariant, }, index: i, } @@ -297,7 +261,10 @@ func GetGPUInfo() GpuInfoList { gpuInfo.FreeMemory = uint64(memInfo.free) gpuInfo.ID = C.GoString(&memInfo.gpu_id[0]) gpuInfo.Compute = fmt.Sprintf("%d.%d", memInfo.major, memInfo.minor) + gpuInfo.computeMajor = int(memInfo.major) + gpuInfo.computeMinor = int(memInfo.minor) gpuInfo.MinimumMemory = cudaMinimumMemory + cudaVariant := cudaGetVariant(gpuInfo) if depPath != "" { gpuInfo.DependencyPath = depPath // Check for variant specific directory @@ -310,6 +277,7 @@ func GetGPUInfo() GpuInfoList { gpuInfo.Name = C.GoString(&memInfo.gpu_name[0]) gpuInfo.DriverMajor = driverMajor gpuInfo.DriverMinor = driverMinor + gpuInfo.Variant = cudaGetVariant(gpuInfo) // query the management library as well so we can record any skew between the two // which represents overhead on the GPU we must set aside on subsequent updates diff --git a/gpu/types.go b/gpu/types.go index 88539078..4cbbeb84 100644 --- a/gpu/types.go +++ b/gpu/types.go @@ -53,8 +53,10 @@ type CPUInfo struct { type CudaGPUInfo struct { GpuInfo - OSOverhead uint64 // Memory overhead between the driver library and management library - index int //nolint:unused,nolintlint + OSOverhead uint64 // Memory overhead between the driver library and management library + index int //nolint:unused,nolintlint + computeMajor int //nolint:unused,nolintlint + computeMinor int //nolint:unused,nolintlint } type CudaGPUInfoList []CudaGPUInfo From f6c811b32075cb3b7633d7d4213251d474a77682 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Fri, 12 Jul 2024 11:35:41 -0700 Subject: [PATCH 58/71] Enable cuda v12 flags --- Dockerfile | 35 ++++++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index e200f5d4..e83a266a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,7 +1,9 @@ ARG GOLANG_VERSION=1.22.5 ARG CMAKE_VERSION=3.22.1 ARG CUDA_VERSION_11=11.3.1 +ARG CUDA_V11_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86" ARG CUDA_VERSION_12=12.4.0 +ARG CUDA_V12_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a" ARG ROCM_VERSION=6.1.2 ARG JETPACK_6=r36.2.0 ARG JETPACK_5=r35.4.1 @@ -21,11 +23,12 @@ ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH COPY --from=llm-code / /go/src/github.com/ollama/ollama/ WORKDIR /go/src/github.com/ollama/ollama/llm/generate ARG CGO_CFLAGS +ARG CUDA_V11_ARCHITECTURES ENV GOARCH amd64 RUN --mount=type=cache,target=/root/.ccache \ OLLAMA_SKIP_STATIC_GENERATE=1 \ OLLAMA_SKIP_CPU_GENERATE=1 \ - CMAKE_CUDA_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86" \ + CMAKE_CUDA_ARCHITECTURES="${CUDA_V11_ARCHITECTURES}" \ CUDA_VARIANT="_v11" \ bash gen_linux.sh @@ -37,12 +40,14 @@ ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH COPY --from=llm-code / /go/src/github.com/ollama/ollama/ WORKDIR /go/src/github.com/ollama/ollama/llm/generate ARG CGO_CFLAGS +ARG CUDA_V12_ARCHITECTURES ENV GOARCH amd64 RUN --mount=type=cache,target=/root/.ccache \ 
OLLAMA_SKIP_STATIC_GENERATE=1 \ OLLAMA_SKIP_CPU_GENERATE=1 \ - CMAKE_CUDA_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a" \ + CMAKE_CUDA_ARCHITECTURES="${CUDA_V12_ARCHITECTURES}" \ CUDA_VARIANT="_v12" \ + OLLAMA_CUSTOM_CUDA_DEFS="-DGGML_CUDA_USE_GRAPHS=on" \ bash gen_linux.sh FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_11-devel-rockylinux8 AS cuda-11-build-server-arm64 @@ -53,9 +58,31 @@ ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH COPY --from=llm-code / /go/src/github.com/ollama/ollama/ WORKDIR /go/src/github.com/ollama/ollama/llm/generate ARG CGO_CFLAGS +ARG CUDA_V11_ARCHITECTURES +ENV GOARCH arm64 +RUN OLLAMA_SKIP_STATIC_GENERATE=1 \ + OLLAMA_SKIP_CPU_GENERATE=1 \ + CMAKE_CUDA_ARCHITECTURES="${CUDA_V11_ARCHITECTURES}" \ + CUDA_VARIANT="_v11" \ + bash gen_linux.sh + +FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_12-devel-rockylinux8 AS cuda-12-build-server-arm64 +ARG CMAKE_VERSION +COPY ./scripts/rh_linux_deps.sh / +RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh +ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH +COPY --from=llm-code / /go/src/github.com/ollama/ollama/ +WORKDIR /go/src/github.com/ollama/ollama/llm/generate +ARG CGO_CFLAGS +ARG CUDA_V12_ARCHITECTURES ENV GOARCH arm64 RUN --mount=type=cache,target=/root/.ccache \ - OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh + OLLAMA_SKIP_STATIC_GENERATE=1 \ + OLLAMA_SKIP_CPU_GENERATE=1 \ + CMAKE_CUDA_ARCHITECTURES="${CUDA_V12_ARCHITECTURES}" \ + CUDA_VARIANT="_v12" \ + OLLAMA_CUSTOM_CUDA_DEFS="-DGGML_CUDA_USE_GRAPHS=on" \ + bash gen_linux.sh FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_6} AS cuda-build-jetpack6-arm64 ARG CMAKE_VERSION @@ -180,6 +207,8 @@ COPY . . COPY --from=static-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ COPY --from=cuda-11-build-server-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ COPY --from=cuda-11-build-server-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ +COPY --from=cuda-12-build-server-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ +COPY --from=cuda-12-build-server-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ ## arm binary += 381M COPY --from=cuda-build-jetpack6-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ COPY --from=cuda-build-jetpack6-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ From 927d98a6cde43ffee3ef269cf013df5e96cbe767 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Fri, 12 Jul 2024 14:33:13 -0700 Subject: [PATCH 59/71] Add windows cuda v12 + v11 support --- .github/workflows/release.yaml | 93 ++++++++++++++++++++++++++++++++-- llm/generate/gen_windows.ps1 | 6 +-- scripts/build_windows.ps1 | 63 ++++++++++++++++++----- 3 files changed, 142 insertions(+), 20 deletions(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 9287f6f7..4bd68455 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -183,8 +183,8 @@ jobs: name: windows-rocm-deps path: dist/deps/* - # CUDA generation step - generate-windows-cuda: + # CUDA v11 generation step + generate-windows-cuda-v11: environment: release runs-on: windows env: @@ -256,7 +256,89 @@ jobs: cp "${NVIDIA_DIR}\cublasLt64_*.dll" "dist\deps\" - uses: actions/upload-artifact@v4 with: - name: generate-windows-cuda + name: generate-windows-cuda-v11 + path: | + llm/build/**/bin/* + dist/windows-amd64/** + - uses: actions/upload-artifact@v4 + with: + name: windows-cuda-deps + path: 
dist/deps/* + + # CUDA v12 generation step + generate-windows-cuda-v12: + environment: release + runs-on: windows + env: + KEY_CONTAINER: ${{ vars.KEY_CONTAINER }} + steps: + - uses: actions/checkout@v4 + - name: Set Version + shell: bash + run: echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV + - uses: 'google-github-actions/auth@v2' + with: + project_id: 'ollama' + credentials_json: '${{ secrets.GOOGLE_SIGNING_CREDENTIALS }}' + - run: echo "${{ vars.OLLAMA_CERT }}" > ollama_inc.crt + - name: install Windows SDK 8.1 to get signtool + run: | + $ErrorActionPreference = "Stop" + write-host "downloading SDK" + Invoke-WebRequest -Uri "https://go.microsoft.com/fwlink/p/?LinkId=323507" -OutFile "${env:RUNNER_TEMP}\sdksetup.exe" + Start-Process "${env:RUNNER_TEMP}\sdksetup.exe" -ArgumentList @("/q") -NoNewWindow -Wait + write-host "Win SDK 8.1 installed" + gci -path 'C:\Program Files (x86)\Windows Kits\' -r -fi 'signtool.exe' + - name: install signing plugin + run: | + $ErrorActionPreference = "Stop" + write-host "downloading plugin" + Invoke-WebRequest -Uri "https://github.com/GoogleCloudPlatform/kms-integrations/releases/download/cng-v1.0/kmscng-1.0-windows-amd64.zip" -OutFile "${env:RUNNER_TEMP}\plugin.zip" + Expand-Archive -Path "${env:RUNNER_TEMP}\plugin.zip" -DestinationPath ${env:RUNNER_TEMP}\plugin\ + write-host "Installing plugin" + & "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet + write-host "plugin installed" + - uses: actions/setup-go@v5 + with: + go-version-file: go.mod + cache: true + - name: 'Install CUDA' + run: | + $ErrorActionPreference = "Stop" + write-host "downloading CUDA Installer" + Invoke-WebRequest -Uri "https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_551.61_windows.exe" -OutFile "${env:RUNNER_TEMP}\cuda-install.exe" + write-host "Installing CUDA" + Start-Process "${env:RUNNER_TEMP}\cuda-install.exe" -ArgumentList '-s' -NoNewWindow -Wait + write-host "Completed CUDA" + $cudaPath=((resolve-path "c:\Program Files\NVIDIA*\CUDA\v*\bin\nvcc.exe")[0].path | split-path | split-path) + $cudaVer=($cudaPath | split-path -leaf ) -replace 'v(\d+).(\d+)', '$1_$2' + echo "$cudaPath\bin" >> $env:GITHUB_PATH + echo "CUDA_PATH=$cudaPath" >> $env:GITHUB_ENV + echo "CUDA_PATH_V${cudaVer}=$cudaPath" >> $env:GITHUB_ENV + echo "CUDA_PATH_VX_Y=CUDA_PATH_V${cudaVer}" >> $env:GITHUB_ENV + - name: 'Verify CUDA' + run: nvcc -V + - run: go get ./... + - name: go generate + run: | + $gopath=(get-command go).source | split-path -parent + $cudabin=(get-command nvcc).source | split-path + & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1" + cd $env:GITHUB_WORKSPACE + $env:CMAKE_SYSTEM_VERSION="10.0.22621.0" + $env:PATH="$gopath;$cudabin;$env:PATH" + $env:OLLAMA_SKIP_CPU_GENERATE="1" + go generate -x ./... 
+ - name: 'gather cuda dependencies' + run: | + $NVIDIA_DIR=(resolve-path 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\*\bin\')[0] + md "dist\deps" + cp "${NVIDIA_DIR}\cudart64_*.dll" "dist\deps\" + cp "${NVIDIA_DIR}\cublas64_*.dll" "dist\deps\" + cp "${NVIDIA_DIR}\cublasLt64_*.dll" "dist\deps\" + - uses: actions/upload-artifact@v4 + with: + name: generate-windows-cuda-v12 path: | llm/build/**/bin/* dist/windows-amd64/** @@ -270,7 +352,8 @@ jobs: environment: release runs-on: windows needs: - - generate-windows-cuda + - generate-windows-cuda-v11 + - generate-windows-cuda-v12 - generate-windows-rocm - generate-windows-cpu env: @@ -314,7 +397,7 @@ jobs: name: generate-windows-cpu - uses: actions/download-artifact@v4 with: - name: generate-windows-cuda + name: generate-windows-cuda-v11 - uses: actions/download-artifact@v4 with: name: windows-cuda-deps diff --git a/llm/generate/gen_windows.ps1 b/llm/generate/gen_windows.ps1 index 1f8c96d8..42708d3e 100644 --- a/llm/generate/gen_windows.ps1 +++ b/llm/generate/gen_windows.ps1 @@ -261,7 +261,7 @@ function build_cuda() { if ((-not "${env:OLLAMA_SKIP_CUDA_GENERATE}") -and ("${script:CUDA_LIB_DIR}")) { # Then build cuda as a dynamically loaded library $nvcc = "$script:CUDA_LIB_DIR\nvcc.exe" - $script:CUDA_VERSION=(get-item ($nvcc | split-path | split-path)).Basename + $script:CUDA_VERSION=((get-item ($nvcc | split-path | split-path)).Basename -Split "\.")[0] if ($null -ne $script:CUDA_VERSION) { $script:CUDA_VARIANT="_"+$script:CUDA_VERSION } @@ -273,9 +273,9 @@ function build_cuda() { "-DGGML_CUDA=ON", "-DGGML_AVX=on", "-DGGML_AVX2=off", - "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR", "-DCMAKE_CUDA_FLAGS=-t8", - "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}" + "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}", + "-DCMAKE_CUDA_COMPILER_TOOLKIT_ROOT=$env:CUDA_PATH" ) if ($null -ne $env:OLLAMA_CUSTOM_CUDA_DEFS) { write-host "OLLAMA_CUSTOM_CUDA_DEFS=`"${env:OLLAMA_CUSTOM_CUDA_DEFS}`"" diff --git a/scripts/build_windows.ps1 b/scripts/build_windows.ps1 index e8d851f4..50b60230 100644 --- a/scripts/build_windows.ps1 +++ b/scripts/build_windows.ps1 @@ -7,6 +7,7 @@ $ErrorActionPreference = "Stop" function checkEnv() { + $script:ARCH = $Env:PROCESSOR_ARCHITECTURE.ToLower() $script:TARGET_ARCH=$Env:PROCESSOR_ARCHITECTURE.ToLower() Write-host "Building for ${script:TARGET_ARCH}" write-host "Locating required tools and paths" @@ -15,26 +16,23 @@ function checkEnv() { $MSVC_INSTALL=(Get-CimInstance MSFT_VSInstance -Namespace root/cimv2/vs)[0].InstallLocation $env:VCToolsRedistDir=(get-item "${MSVC_INSTALL}\VC\Redist\MSVC\*")[0] } - # Try to find the CUDA dir - if ($null -eq $env:NVIDIA_DIR) { + # Locate CUDA versions + # Note: this assumes every version found will be built + $cudaList=(get-item "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v*\bin\" -ea 'silentlycontinue') + if ($cudaList.length -eq 0) { $d=(get-command -ea 'silentlycontinue' nvcc).path - if ($d -ne $null) { - $script:NVIDIA_DIR=($d| split-path -parent) - } else { - $cudaList=(get-item "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v*\bin\" -ea 'silentlycontinue') - if ($cudaList.length > 0) { - $script:NVIDIA_DIR=$cudaList[0] - } + if ($null -ne $d) { + $script:CUDA_DIRS=@($d| split-path -parent) } } else { - $script:NVIDIA_DIR=$env:NVIDIA_DIR + $script:CUDA_DIRS=$cudaList } $script:INNO_SETUP_DIR=(get-item "C:\Program Files*\Inno Setup*\")[0] $script:DEPS_DIR="${script:SRC_DIR}\dist\windows-${script:TARGET_ARCH}" $env:CGO_ENABLED="1" - 
echo "Checking version" + Write-Output "Checking version" if (!$env:VERSION) { $data=(git describe --tags --first-parent --abbrev=7 --long --dirty --always) $pattern="v(.+)" @@ -71,7 +69,48 @@ function checkEnv() { function buildOllama() { write-host "Building ollama CLI" if ($null -eq ${env:OLLAMA_SKIP_GENERATE}) { - & go generate ./... + Remove-Item -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}" + + # TODO - consider trying to parallelize this with Start-ThreadJob, but env vars can't be used to toggle + # which targets to build + + # Start by skipping CUDA to build everything else + pwsh -Command { $env:OLLAMA_SKIP_CUDA_GENERATE="1"; & go generate ./... } + if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} + + # Then skip everyhting else and build all the CUDA variants + foreach ($env:CUDA_LIB_DIR in $script:CUDA_DIRS) { + write-host "Building CUDA ${env:CUDA_LIB_DIR}" + + if ($env:CUDA_LIB_DIR.Contains("v12")) { + pwsh -Command { + $env:OLLAMA_SKIP_CUDA_GENERATE="" + $env:OLLAMA_SKIP_STATIC_GENERATE="1" + $env:OLLAMA_SKIP_CPU_GENERATE="1" + $env:OLLAMA_SKIP_ONEAPI_GENERATE="1" + $env:OLLAMA_SKIP_ROCM_GENERATE="1" + $env:CMAKE_CUDA_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a" + $env:OLLAMA_CUSTOM_CUDA_DEFS="-DGGML_CUDA_USE_GRAPHS=on" + $env:CUDA_PATH=split-path -path $env:CUDA_LIB_DIR -parent + $env:PATH="$envs:CUDA_LIB_DIR;$env:PATH" + & go generate ./... + } + } else { + pwsh -Command { + $env:OLLAMA_SKIP_CUDA_GENERATE="" + $env:OLLAMA_SKIP_STATIC_GENERATE="1" + $env:OLLAMA_SKIP_CPU_GENERATE="1" + $env:OLLAMA_SKIP_ONEAPI_GENERATE="1" + $env:OLLAMA_SKIP_ROCM_GENERATE="1" + $env:CMAKE_CUDA_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86" + $env:OLLAMA_CUSTOM_CUDA_DEFS="" + $env:CUDA_PATH=split-path -path $env:CUDA_LIB_DIR -parent + $env:PATH="$envs:CUDA_LIB_DIR;$env:PATH" + & go generate ./... 
+ } + } + if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} + } if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} } else { write-host "Skipping generate step with OLLAMA_SKIP_GENERATE set" From 3b19cdba2a090772b2e886dbfbf712992fafe0cd Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Tue, 13 Aug 2024 13:30:28 -0700 Subject: [PATCH 60/71] Remove Jetpack --- Dockerfile | 42 ------------------------------------------ 1 file changed, 42 deletions(-) diff --git a/Dockerfile b/Dockerfile index e83a266a..99ba5b65 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,9 +5,6 @@ ARG CUDA_V11_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86" ARG CUDA_VERSION_12=12.4.0 ARG CUDA_V12_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a" ARG ROCM_VERSION=6.1.2 -ARG JETPACK_6=r36.2.0 -ARG JETPACK_5=r35.4.1 -ARG JETPACK_4=r32.7.1 # Copy the minimal context we need to run the generate scripts FROM scratch AS llm-code @@ -84,39 +81,6 @@ RUN --mount=type=cache,target=/root/.ccache \ OLLAMA_CUSTOM_CUDA_DEFS="-DGGML_CUDA_USE_GRAPHS=on" \ bash gen_linux.sh -FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_6} AS cuda-build-jetpack6-arm64 -ARG CMAKE_VERSION -RUN apt-get update && apt-get install -y git curl && \ - curl -s -L https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-$(uname -m).tar.gz | tar -zx -C /usr --strip-components 1 -COPY --from=llm-code / /go/src/github.com/ollama/ollama/ -WORKDIR /go/src/github.com/ollama/ollama/llm/generate -ARG CGO_CFLAGS -ENV GOARCH arm64 -ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs -RUN --mount=type=cache,target=/root/.ccache \ - OLLAMA_SKIP_STATIC_GENERATE=1 \ - OLLAMA_SKIP_CPU_GENERATE=1 \ - CUDA_VARIANT="_jetpack6" \ - CUDA_DIST_DIR="/go/src/github.com/ollama/ollama/dist/linux-arm64/ollama_libs/cuda_jetpack6" \ - CMAKE_CUDA_ARCHITECTURES="87" \ - bash gen_linux.sh - -FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_5} AS cuda-build-jetpack5-arm64 -ARG CMAKE_VERSION -RUN apt-get update && apt-get install -y git curl && \ - curl -s -L https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-$(uname -m).tar.gz | tar -zx -C /usr --strip-components 1 -COPY --from=llm-code / /go/src/github.com/ollama/ollama/ -WORKDIR /go/src/github.com/ollama/ollama/llm/generate -ARG CGO_CFLAGS -ENV GOARCH arm64 -ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs -RUN --mount=type=cache,target=/root/.ccache \ - OLLAMA_SKIP_STATIC_GENERATE=1 \ - OLLAMA_SKIP_CPU_GENERATE=1 \ - CUDA_VARIANT="_jetpack5" \ - CUDA_DIST_DIR="/go/src/github.com/ollama/ollama/dist/linux-arm64/ollama_libs/cuda_jetpack5" \ - CMAKE_CUDA_ARCHITECTURES="72;87" \ - bash gen_linux.sh FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS rocm-build-amd64 ARG CMAKE_VERSION @@ -209,12 +173,6 @@ COPY --from=cuda-11-build-server-arm64 /go/src/github.com/ollama/ollama/dist/ di COPY --from=cuda-11-build-server-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ COPY --from=cuda-12-build-server-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ COPY --from=cuda-12-build-server-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ -## arm binary += 381M -COPY --from=cuda-build-jetpack6-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ -COPY --from=cuda-build-jetpack6-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ -## arm binary += 330M -COPY --from=cuda-build-jetpack5-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ -COPY 
--from=cuda-build-jetpack5-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ ARG GOFLAGS ARG CGO_CFLAGS RUN --mount=type=cache,target=/root/.ccache \ From 88bb9e332877dfbba40030c19570fdbe00f41a21 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Wed, 14 Aug 2024 16:32:57 -0700 Subject: [PATCH 61/71] Adjust layout to bin+lib/ollama --- Dockerfile | 23 ++++++++++++++------ app/ollama.iss | 12 +++++------ docs/linux.md | 10 ++++----- envconfig/config.go | 6 +++--- gpu/amd_common.go | 2 +- gpu/amd_windows.go | 2 +- gpu/gpu.go | 4 ++-- llm/generate/gen_linux.sh | 6 +++--- llm/generate/gen_windows.ps1 | 42 ++++++++++++++++++------------------ scripts/build_windows.ps1 | 16 +++++++------- scripts/install.sh | 14 +++++++----- 11 files changed, 74 insertions(+), 63 deletions(-) diff --git a/Dockerfile b/Dockerfile index 99ba5b65..d4b86918 100644 --- a/Dockerfile +++ b/Dockerfile @@ -95,8 +95,8 @@ ARG AMDGPU_TARGETS ENV GOARCH amd64 RUN --mount=type=cache,target=/root/.ccache \ OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh -RUN mkdir -p ../../dist/linux-amd64/ollama_libs && \ - (cd /opt/rocm/lib && tar cf - rocblas/library) | (cd ../../dist/linux-amd64/ollama_libs && tar xf - ) +RUN mkdir -p ../../dist/linux-amd64/lib/ollama && \ + (cd /opt/rocm/lib && tar cf - rocblas/library) | (cd ../../dist/linux-amd64/lib/ollama && tar xf - ) FROM --platform=linux/amd64 centos:7 AS cpu-builder-amd64 ARG CMAKE_VERSION @@ -160,7 +160,7 @@ COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ l ARG GOFLAGS ARG CGO_CFLAGS RUN --mount=type=cache,target=/root/.ccache \ - go build -trimpath -o dist/linux-amd64/ollama . + go build -trimpath -o dist/linux-amd64/bin/ollama . # Intermediate stage used for ./scripts/build_linux.sh FROM --platform=linux/arm64 cpu-build-arm64 AS build-arm64 @@ -176,20 +176,29 @@ COPY --from=cuda-12-build-server-arm64 /go/src/github.com/ollama/ollama/llm/buil ARG GOFLAGS ARG CGO_CFLAGS RUN --mount=type=cache,target=/root/.ccache \ - go build -trimpath -o dist/linux-arm64/ollama . + go build -trimpath -o dist/linux-arm64/bin/ollama . 
+ +# Strip out ROCm dependencies to keep the primary image lean +FROM --platform=linux/amd64 ubuntu:22.04 as amd64-libs-without-rocm +COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /scratch/ +RUN cd /scratch/ollama/ && rm -rf rocblas libamd* libdrm* libroc* libhip* libhsa* # Runtime stages FROM --platform=linux/amd64 ubuntu:22.04 as runtime-amd64 +COPY --from=amd64-libs-without-rocm /scratch/ /lib/ RUN apt-get update && apt-get install -y ca-certificates -COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/ollama /bin/ollama +COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/ + FROM --platform=linux/arm64 ubuntu:22.04 as runtime-arm64 +COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/ RUN apt-get update && apt-get install -y ca-certificates -COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/ollama /bin/ollama +COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/bin/ /bin/ # Radeon images are much larger so we keep it distinct from the CPU/CUDA image FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete as runtime-rocm RUN update-pciids -COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/ollama /bin/ollama +COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/ +RUN ln -s /opt/rocm/lib /lib/ollama EXPOSE 11434 ENV OLLAMA_HOST 0.0.0.0 diff --git a/app/ollama.iss b/app/ollama.iss index e9cf48ec..bce0a337 100644 --- a/app/ollama.iss +++ b/app/ollama.iss @@ -87,11 +87,11 @@ DialogFontSize=12 [Files] Source: ".\app.exe"; DestDir: "{app}"; DestName: "{#MyAppExeName}" ; Flags: ignoreversion 64bit -Source: "..\ollama.exe"; DestDir: "{app}"; Flags: ignoreversion 64bit -Source: "..\dist\windows-{#ARCH}\ollama_runners\*"; DestDir: "{app}\ollama_runners"; Flags: ignoreversion 64bit recursesubdirs +Source: "..\ollama.exe"; DestDir: "{app}\bin"; Flags: ignoreversion 64bit +Source: "..\dist\windows-{#ARCH}\lib\ollama\runners\*"; DestDir: "{app}\lib\ollama\runners"; Flags: ignoreversion 64bit recursesubdirs Source: "..\dist\ollama_welcome.ps1"; DestDir: "{app}"; Flags: ignoreversion Source: ".\assets\app.ico"; DestDir: "{app}"; Flags: ignoreversion -Source: "..\dist\windows-amd64\ollama_libs\*"; DestDir: "{app}\ollama_libs\"; Flags: ignoreversion recursesubdirs +Source: "..\dist\windows-amd64\lib\ollama\*"; DestDir: "{app}\lib\ollama\"; Flags: ignoreversion recursesubdirs [Icons] Name: "{group}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilename: "{app}\app.ico" @@ -99,7 +99,7 @@ Name: "{userstartup}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilen Name: "{userprograms}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilename: "{app}\app.ico" [Run] -Filename: "{cmd}"; Parameters: "/C set PATH={app};%PATH% & ""{app}\{#MyAppExeName}"""; Flags: postinstall nowait runhidden +Filename: "{cmd}"; Parameters: "/C set PATH={app}\bin;%PATH% & ""{app}\{#MyAppExeName}"""; Flags: postinstall nowait runhidden [UninstallRun] ; Filename: "{cmd}"; Parameters: "/C ""taskkill /im ''{#MyAppExeName}'' /f /t"; Flags: runhidden @@ -134,8 +134,8 @@ SetupAppRunningError=Another Ollama installer is running.%n%nPlease cancel or fi [Registry] Root: HKCU; Subkey: "Environment"; \ - ValueType: expandsz; ValueName: "Path"; ValueData: "{olddata};{app}"; \ - Check: NeedsAddPath('{app}') + ValueType: expandsz; ValueName: "Path"; ValueData: "{olddata};{app}\bin"; \ + Check: 
NeedsAddPath('{app}\bin') [Code] diff --git a/docs/linux.md b/docs/linux.md index ec730656..3ed2bed0 100644 --- a/docs/linux.md +++ b/docs/linux.md @@ -20,13 +20,12 @@ GPU. ## Manual install -### Download the `ollama` binary +### Download the `ollama` tar file -Ollama is distributed as a self-contained binary. Download it to a directory in your PATH: +Ollama is distributed as a tar file including GPU library dependencies. ```bash -sudo curl -L https://ollama.com/download/ollama-linux-amd64 -o /usr/bin/ollama -sudo chmod +x /usr/bin/ollama +curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz | sudo tar -C /usr -zxf - ``` ### Adding Ollama as a startup service (recommended) @@ -96,8 +95,7 @@ curl -fsSL https://ollama.com/install.sh | sh Or by downloading the ollama binary: ```bash -sudo curl -L https://ollama.com/download/ollama-linux-amd64 -o /usr/bin/ollama -sudo chmod +x /usr/bin/ollama +curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz | sudo tar -C /usr -zxf - ``` ## Installing specific versions diff --git a/envconfig/config.go b/envconfig/config.go index 7f0976c0..7e45a4f5 100644 --- a/envconfig/config.go +++ b/envconfig/config.go @@ -174,7 +174,7 @@ func RunnersDir() (p string) { defer func() { if p == "" { - slog.Error("unable to locate llm runner directory. Set OLLAMA_RUNNERS_DIR to the location of 'ollama_runners'") + slog.Error("unable to locate llm runner directory. Set OLLAMA_RUNNERS_DIR to the location of 'ollama/runners'") } }() @@ -190,7 +190,7 @@ func RunnersDir() (p string) { } var paths []string - for _, root := range []string{filepath.Dir(exe), cwd} { + for _, root := range []string{filepath.Dir(exe), filepath.Join(filepath.Dir(exe), ".."), cwd} { paths = append(paths, root, filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH), @@ -200,7 +200,7 @@ func RunnersDir() (p string) { // Try a few variations to improve developer experience when building from source in the local tree for _, path := range paths { - candidate := filepath.Join(path, "ollama_runners") + candidate := filepath.Join(path, "lib", "ollama", "runners") if _, err := os.Stat(candidate); err == nil { p = candidate break diff --git a/gpu/amd_common.go b/gpu/amd_common.go index 05747208..72d204f7 100644 --- a/gpu/amd_common.go +++ b/gpu/amd_common.go @@ -54,7 +54,7 @@ func commonAMDValidateLibDir() (string, error) { // Installer payload location if we're running the installed binary exe, err := os.Executable() if err == nil { - rocmTargetDir := filepath.Join(filepath.Dir(exe), "ollama_libs") + rocmTargetDir := filepath.Join(filepath.Dir(exe), "..", "lib", "ollama") if rocmLibUsable(rocmTargetDir) { slog.Debug("detected ROCM next to ollama executable " + rocmTargetDir) return rocmTargetDir, nil diff --git a/gpu/amd_windows.go b/gpu/amd_windows.go index 5d25a966..a0ae7c96 100644 --- a/gpu/amd_windows.go +++ b/gpu/amd_windows.go @@ -153,7 +153,7 @@ func AMDValidateLibDir() (string, error) { // Installer payload (if we're running from some other location) localAppData := os.Getenv("LOCALAPPDATA") appDir := filepath.Join(localAppData, "Programs", "Ollama") - rocmTargetDir := filepath.Join(appDir, "ollama_libs") + rocmTargetDir := filepath.Join(appDir, "..", "lib", "ollama") if rocmLibUsable(rocmTargetDir) { slog.Debug("detected ollama installed ROCm at " + rocmTargetDir) return rocmTargetDir, nil diff --git a/gpu/gpu.go b/gpu/gpu.go index eb87807a..391c98a8 100644 --- a/gpu/gpu.go +++ b/gpu/gpu.go @@ -653,8 +653,8 @@ func GetDepDir() string { slog.Warn("failed to lookup working directory", 
"error", err) } // Scan for any of our dependeices, and pick first match - for _, root := range []string{filepath.Dir(appExe), cwd} { - libDep := "ollama_libs" + for _, root := range []string{filepath.Dir(appExe), filepath.Join(filepath.Dir(appExe), ".."), cwd} { + libDep := filepath.Join("lib", "ollama") if _, err := os.Stat(filepath.Join(root, libDep)); err == nil { return filepath.Join(root, libDep) } diff --git a/llm/generate/gen_linux.sh b/llm/generate/gen_linux.sh index dc9dda5a..aef03f9a 100755 --- a/llm/generate/gen_linux.sh +++ b/llm/generate/gen_linux.sh @@ -189,7 +189,7 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS} -DGGML_STATIC=off" BUILD_DIR="../build/linux/${ARCH}/cuda${CUDA_VARIANT}" export LLAMA_SERVER_LDFLAGS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda" - CUDA_DIST_DIR="${CUDA_DIST_DIR:-${DIST_BASE}/ollama_libs}" + CUDA_DIST_DIR="${CUDA_DIST_DIR:-${DIST_BASE}/lib/ollama}" build install echo "Installing CUDA dependencies in ${CUDA_DIST_DIR}" @@ -213,7 +213,7 @@ if [ -z "${OLLAMA_SKIP_ONEAPI_GENERATE}" -a -d "${ONEAPI_ROOT}" ]; then CC=icx CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL=ON -DGGML_SYCL_F16=OFF" BUILD_DIR="../build/linux/${ARCH}/oneapi" - ONEAPI_DIST_DIR="${DIST_BASE}/ollama_libs" + ONEAPI_DIST_DIR="${DIST_BASE}/lib/ollama" export LLAMA_SERVER_LDFLAGS="-fsycl -lOpenCL -lmkl_core -lmkl_sycl_blas -lmkl_intel_ilp64 -lmkl_tbb_thread -ltbb" DEBUG_FLAGS="" # icx compiles with -O0 if we pass -g, so we must remove it build @@ -260,7 +260,7 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then echo "Building custom ROCM GPU" fi BUILD_DIR="../build/linux/${ARCH}/rocm${ROCM_VARIANT}" - ROCM_DIST_DIR="${DIST_BASE}/ollama_libs" + ROCM_DIST_DIR="${DIST_BASE}/lib/ollama" # TODO figure out how to disable runpath (rpath) # export CMAKE_HIP_FLAGS="-fno-rtlib-add-rpath" # doesn't work export LLAMA_SERVER_LDFLAGS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu" diff --git a/llm/generate/gen_windows.ps1 b/llm/generate/gen_windows.ps1 index 42708d3e..4d43c9e2 100644 --- a/llm/generate/gen_windows.ps1 +++ b/llm/generate/gen_windows.ps1 @@ -35,7 +35,7 @@ function init_vars { ) $script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on") $script:ARCH = $Env:PROCESSOR_ARCHITECTURE.ToLower() - $script:DIST_BASE = "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_runners" + $script:DIST_BASE = "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\runners" md "$script:DIST_BASE" -ea 0 > $null if ($env:CGO_CFLAGS -contains "-g") { $script:cmakeDefs += @("-DCMAKE_VERBOSE_MAKEFILE=on", "-DLLAMA_SERVER_VERBOSE=on", "-DCMAKE_BUILD_TYPE=RelWithDebInfo") @@ -286,11 +286,11 @@ function build_cuda() { sign install - md "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" -ea 0 > $null - write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" - cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" - cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" - cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" + md "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" -ea 0 > $null 
+ write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" } else { write-host "Skipping CUDA generation step" } @@ -324,17 +324,17 @@ function build_oneapi() { sign install - md "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" -ea 0 > $null - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libirngmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libmmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_level_zero.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_unified_runtime.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_win_proxy_loader.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\svml_dispmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\sycl7.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" - cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_core.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" - cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_sycl_blas.4.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" - cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_tbb_thread.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" + md "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" -ea 0 > $null + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libirngmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libmmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_level_zero.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_unified_runtime.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_win_proxy_loader.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\svml_dispmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\sycl7.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_core.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_sycl_blas.4.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_tbb_thread.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" } else { Write-Host "Skipping oneAPI generation step" } @@ -384,11 +384,11 @@ function build_rocm() { sign install - md "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\rocblas\library\" -ea 0 > $null - cp "${env:HIP_PATH}\bin\hipblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" - cp "${env:HIP_PATH}\bin\rocblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" + md 
"${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\rocblas\library\" -ea 0 > $null + cp "${env:HIP_PATH}\bin\hipblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${env:HIP_PATH}\bin\rocblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" # amdhip64.dll dependency comes from the driver and must be installed on the host to use AMD GPUs - cp "${env:HIP_PATH}\bin\rocblas\library\*" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\rocblas\library\" + cp "${env:HIP_PATH}\bin\rocblas\library\*" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\rocblas\library\" } else { write-host "Skipping ROCm generation step" } diff --git a/scripts/build_windows.ps1 b/scripts/build_windows.ps1 index 50b60230..9cebf1f4 100644 --- a/scripts/build_windows.ps1 +++ b/scripts/build_windows.ps1 @@ -122,8 +122,8 @@ function buildOllama() { /csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} ollama.exe if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} } - New-Item -ItemType Directory -Path .\dist\windows-${script:TARGET_ARCH}\ -Force - cp .\ollama.exe .\dist\windows-${script:TARGET_ARCH}\ + New-Item -ItemType Directory -Path .\dist\windows-${script:TARGET_ARCH}\bin\ -Force + cp .\ollama.exe .\dist\windows-${script:TARGET_ARCH}\bin\ } function buildApp() { @@ -142,22 +142,22 @@ function buildApp() { function gatherDependencies() { write-host "Gathering runtime dependencies" cd "${script:SRC_DIR}" - md "${script:DEPS_DIR}\ollama_libs" -ea 0 > $null + md "${script:DEPS_DIR}\lib\ollama" -ea 0 > $null # TODO - this varies based on host build system and MSVC version - drive from dumpbin output # currently works for Win11 + MSVC 2019 + Cuda V11 - cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\msvcp140*.dll" "${script:DEPS_DIR}\ollama_libs\" - cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DEPS_DIR}\ollama_libs\" - cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DEPS_DIR}\ollama_libs\" + cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\msvcp140*.dll" "${script:DEPS_DIR}\lib\ollama\" + cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DEPS_DIR}\lib\ollama\" + cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DEPS_DIR}\lib\ollama\" foreach ($part in $("runtime", "stdio", "filesystem", "math", "convert", "heap", "string", "time", "locale", "environment")) { - cp "$env:VCToolsRedistDir\..\..\..\Tools\Llvm\x64\bin\api-ms-win-crt-${part}*.dll" "${script:DEPS_DIR}\ollama_libs\" + cp "$env:VCToolsRedistDir\..\..\..\Tools\Llvm\x64\bin\api-ms-win-crt-${part}*.dll" "${script:DEPS_DIR}\lib\ollama\" } cp "${script:SRC_DIR}\app\ollama_welcome.ps1" "${script:SRC_DIR}\dist\" if ("${env:KEY_CONTAINER}") { write-host "about to sign" - foreach ($file in (get-childitem "${script:DEPS_DIR}\ollama_libs\cu*.dll") + @("${script:SRC_DIR}\dist\ollama_welcome.ps1")){ + foreach ($file in (get-childitem "${script:DEPS_DIR}\lib\ollama\cu*.dll") + @("${script:SRC_DIR}\dist\ollama_welcome.ps1")){ write-host "signing $file" & "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" ` /csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} $file diff --git a/scripts/install.sh b/scripts/install.sh index f0439b00..a02a0675 100644 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -66,7 +66,7 @@ fi for BINDIR in /usr/local/bin /usr/bin /bin; do echo $PATH | grep -q $BINDIR && break || continue done 
-OLLAMA_INSTALL_DIR=${OLLAMA_INSTALL_DIR:-${BINDIR}} +OLLAMA_INSTALL_DIR=$(dirname ${BINDIR}) status "Installing ollama to $OLLAMA_INSTALL_DIR" $SUDO install -o0 -g0 -m755 -d $BINDIR @@ -77,18 +77,22 @@ if curl -I --silent --fail --location "https://ollama.com/download/ollama-linux- "https://ollama.com/download/ollama-linux-${ARCH}.tgz${VER_PARAM}" | \ $SUDO tar -xzf - -C "$OLLAMA_INSTALL_DIR" BUNDLE=1 + if [ "$OLLAMA_INSTALL_DIR/bin/ollama" != "$BINDIR/ollama" ] ; then + status "Making ollama accessible in the PATH in $BINDIR" + $SUDO ln -sf "$OLLAMA_INSTALL_DIR/ollama" "$BINDIR/ollama" + fi else status "Downloading Linux ${ARCH} CLI" curl --fail --show-error --location --progress-bar -o "$TEMP_DIR/ollama"\ "https://ollama.com/download/ollama-linux-${ARCH}${VER_PARAM}" $SUDO install -o0 -g0 -m755 $TEMP_DIR/ollama $OLLAMA_INSTALL_DIR/ollama BUNDLE=0 + if [ "$OLLAMA_INSTALL_DIR/ollama" != "$BINDIR/ollama" ] ; then + status "Making ollama accessible in the PATH in $BINDIR" + $SUDO ln -sf "$OLLAMA_INSTALL_DIR/ollama" "$BINDIR/ollama" + fi fi -if [ "$OLLAMA_INSTALL_DIR/ollama" != "$BINDIR/ollama" ] ; then - status "Making ollama accessible in the PATH in $BINDIR" - $SUDO ln -sf "$OLLAMA_INSTALL_DIR/ollama" "$BINDIR/ollama" -fi install_success() { status 'The Ollama API is now available at 127.0.0.1:11434.' From f9e31da9463092d7b3661594788c259d6d55b3d9 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Thu, 15 Aug 2024 14:38:14 -0700 Subject: [PATCH 62/71] Review comments --- .github/workflows/release.yaml | 106 ++++++--------------------------- docs/linux.md | 8 +-- gpu/cuda_common.go | 2 +- gpu/gpu.go | 16 ++--- llm/generate/gen_windows.ps1 | 4 +- 5 files changed, 32 insertions(+), 104 deletions(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 4bd68455..508fbb35 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -183,10 +183,17 @@ jobs: name: windows-rocm-deps path: dist/deps/* - # CUDA v11 generation step - generate-windows-cuda-v11: + # CUDA generation step + generate-windows-cuda: environment: release runs-on: windows + strategy: + matrix: + cuda: + - version: "11" + url: 'https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe' + - version: "12" + url: 'https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_551.61_windows.exe' env: KEY_CONTAINER: ${{ vars.KEY_CONTAINER }} steps: @@ -220,11 +227,11 @@ jobs: with: go-version-file: go.mod cache: true - - name: 'Install CUDA' + - name: 'Install CUDA ${{ matrix.cuda.version }}' run: | $ErrorActionPreference = "Stop" write-host "downloading CUDA Installer" - Invoke-WebRequest -Uri "https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe" -OutFile "${env:RUNNER_TEMP}\cuda-install.exe" + Invoke-WebRequest -Uri "${{ matrix.cuda.url }}" -OutFile "${env:RUNNER_TEMP}\cuda-install.exe" write-host "Installing CUDA" Start-Process "${env:RUNNER_TEMP}\cuda-install.exe" -ArgumentList '-s' -NoNewWindow -Wait write-host "Completed CUDA" @@ -256,7 +263,7 @@ jobs: cp "${NVIDIA_DIR}\cublasLt64_*.dll" "dist\deps\" - uses: actions/upload-artifact@v4 with: - name: generate-windows-cuda-v11 + name: generate-windows-cuda-${{ matrix.cuda.version }} path: | llm/build/**/bin/* dist/windows-amd64/** @@ -265,95 +272,13 @@ jobs: name: windows-cuda-deps path: dist/deps/* - # CUDA v12 generation step - generate-windows-cuda-v12: - environment: release - runs-on: windows - env: - 
KEY_CONTAINER: ${{ vars.KEY_CONTAINER }} - steps: - - uses: actions/checkout@v4 - - name: Set Version - shell: bash - run: echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV - - uses: 'google-github-actions/auth@v2' - with: - project_id: 'ollama' - credentials_json: '${{ secrets.GOOGLE_SIGNING_CREDENTIALS }}' - - run: echo "${{ vars.OLLAMA_CERT }}" > ollama_inc.crt - - name: install Windows SDK 8.1 to get signtool - run: | - $ErrorActionPreference = "Stop" - write-host "downloading SDK" - Invoke-WebRequest -Uri "https://go.microsoft.com/fwlink/p/?LinkId=323507" -OutFile "${env:RUNNER_TEMP}\sdksetup.exe" - Start-Process "${env:RUNNER_TEMP}\sdksetup.exe" -ArgumentList @("/q") -NoNewWindow -Wait - write-host "Win SDK 8.1 installed" - gci -path 'C:\Program Files (x86)\Windows Kits\' -r -fi 'signtool.exe' - - name: install signing plugin - run: | - $ErrorActionPreference = "Stop" - write-host "downloading plugin" - Invoke-WebRequest -Uri "https://github.com/GoogleCloudPlatform/kms-integrations/releases/download/cng-v1.0/kmscng-1.0-windows-amd64.zip" -OutFile "${env:RUNNER_TEMP}\plugin.zip" - Expand-Archive -Path "${env:RUNNER_TEMP}\plugin.zip" -DestinationPath ${env:RUNNER_TEMP}\plugin\ - write-host "Installing plugin" - & "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet - write-host "plugin installed" - - uses: actions/setup-go@v5 - with: - go-version-file: go.mod - cache: true - - name: 'Install CUDA' - run: | - $ErrorActionPreference = "Stop" - write-host "downloading CUDA Installer" - Invoke-WebRequest -Uri "https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_551.61_windows.exe" -OutFile "${env:RUNNER_TEMP}\cuda-install.exe" - write-host "Installing CUDA" - Start-Process "${env:RUNNER_TEMP}\cuda-install.exe" -ArgumentList '-s' -NoNewWindow -Wait - write-host "Completed CUDA" - $cudaPath=((resolve-path "c:\Program Files\NVIDIA*\CUDA\v*\bin\nvcc.exe")[0].path | split-path | split-path) - $cudaVer=($cudaPath | split-path -leaf ) -replace 'v(\d+).(\d+)', '$1_$2' - echo "$cudaPath\bin" >> $env:GITHUB_PATH - echo "CUDA_PATH=$cudaPath" >> $env:GITHUB_ENV - echo "CUDA_PATH_V${cudaVer}=$cudaPath" >> $env:GITHUB_ENV - echo "CUDA_PATH_VX_Y=CUDA_PATH_V${cudaVer}" >> $env:GITHUB_ENV - - name: 'Verify CUDA' - run: nvcc -V - - run: go get ./... - - name: go generate - run: | - $gopath=(get-command go).source | split-path -parent - $cudabin=(get-command nvcc).source | split-path - & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1" - cd $env:GITHUB_WORKSPACE - $env:CMAKE_SYSTEM_VERSION="10.0.22621.0" - $env:PATH="$gopath;$cudabin;$env:PATH" - $env:OLLAMA_SKIP_CPU_GENERATE="1" - go generate -x ./... 
- - name: 'gather cuda dependencies' - run: | - $NVIDIA_DIR=(resolve-path 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\*\bin\')[0] - md "dist\deps" - cp "${NVIDIA_DIR}\cudart64_*.dll" "dist\deps\" - cp "${NVIDIA_DIR}\cublas64_*.dll" "dist\deps\" - cp "${NVIDIA_DIR}\cublasLt64_*.dll" "dist\deps\" - - uses: actions/upload-artifact@v4 - with: - name: generate-windows-cuda-v12 - path: | - llm/build/**/bin/* - dist/windows-amd64/** - - uses: actions/upload-artifact@v4 - with: - name: windows-cuda-deps - path: dist/deps/* # Import the prior generation steps and build the final windows assets build-windows: environment: release runs-on: windows needs: - - generate-windows-cuda-v11 - - generate-windows-cuda-v12 + - generate-windows-cuda - generate-windows-rocm - generate-windows-cpu env: @@ -397,7 +322,10 @@ jobs: name: generate-windows-cpu - uses: actions/download-artifact@v4 with: - name: generate-windows-cuda-v11 + name: generate-windows-cuda-11 + - uses: actions/download-artifact@v4 + with: + name: generate-windows-cuda-12 - uses: actions/download-artifact@v4 with: name: windows-cuda-deps diff --git a/docs/linux.md b/docs/linux.md index 3ed2bed0..d1d5892c 100644 --- a/docs/linux.md +++ b/docs/linux.md @@ -20,12 +20,12 @@ GPU. ## Manual install -### Download the `ollama` tar file +### Download `ollama` -Ollama is distributed as a tar file including GPU library dependencies. +Download and extract the Linux package: ```bash -curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz | sudo tar -C /usr -zxf - +curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz | sudo tar zx -C /usr ``` ### Adding Ollama as a startup service (recommended) @@ -95,7 +95,7 @@ curl -fsSL https://ollama.com/install.sh | sh Or by downloading the ollama binary: ```bash -curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz | sudo tar -C /usr -zxf - +curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz | sudo tar zx -C /usr ``` ## Installing specific versions diff --git a/gpu/cuda_common.go b/gpu/cuda_common.go index defaa60a..827cc9b4 100644 --- a/gpu/cuda_common.go +++ b/gpu/cuda_common.go @@ -28,7 +28,7 @@ func cudaGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) { return "CUDA_VISIBLE_DEVICES", strings.Join(ids, ",") } -func cudaGetVariant(gpuInfo CudaGPUInfo) string { +func cudaVariant(gpuInfo CudaGPUInfo) string { if runtime.GOARCH == "arm64" && runtime.GOOS == "linux" { if CudaTegra != "" { ver := strings.Split(CudaTegra, ".") diff --git a/gpu/gpu.go b/gpu/gpu.go index 391c98a8..72d237a6 100644 --- a/gpu/gpu.go +++ b/gpu/gpu.go @@ -225,7 +225,7 @@ func GetGPUInfo() GpuInfoList { return GpuInfoList{cpus[0].GpuInfo} } - depPath := GetDepDir() + depPath := LibraryDir() // Load ALL libraries cHandles = initCudaHandles() @@ -264,20 +264,20 @@ func GetGPUInfo() GpuInfoList { gpuInfo.computeMajor = int(memInfo.major) gpuInfo.computeMinor = int(memInfo.minor) gpuInfo.MinimumMemory = cudaMinimumMemory - cudaVariant := cudaGetVariant(gpuInfo) + variant := cudaVariant(gpuInfo) if depPath != "" { gpuInfo.DependencyPath = depPath // Check for variant specific directory - if cudaVariant != "" { - if _, err := os.Stat(filepath.Join(depPath, "cuda_"+cudaVariant)); err == nil { - gpuInfo.DependencyPath = filepath.Join(depPath, "cuda_"+cudaVariant) + if variant != "" { + if _, err := os.Stat(filepath.Join(depPath, "cuda_"+variant)); err == nil { + gpuInfo.DependencyPath = filepath.Join(depPath, "cuda_"+variant) } } } gpuInfo.Name = C.GoString(&memInfo.gpu_name[0]) gpuInfo.DriverMajor = 
driverMajor gpuInfo.DriverMinor = driverMinor - gpuInfo.Variant = cudaGetVariant(gpuInfo) + gpuInfo.Variant = variant // query the management library as well so we can record any skew between the two // which represents overhead on the GPU we must set aside on subsequent updates @@ -468,7 +468,7 @@ func FindGPULibs(baseLibName string, defaultPatterns []string) []string { slog.Debug("Searching for GPU library", "name", baseLibName) // Start with our bundled libraries - patterns := []string{filepath.Join(GetDepDir(), baseLibName)} + patterns := []string{filepath.Join(LibraryDir(), baseLibName)} switch runtime.GOOS { case "windows": @@ -642,7 +642,7 @@ func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) { } } -func GetDepDir() string { +func LibraryDir() string { // On Windows/linux we bundle the dependencies at the same level as the executable appExe, err := os.Executable() if err != nil { diff --git a/llm/generate/gen_windows.ps1 b/llm/generate/gen_windows.ps1 index 4d43c9e2..cbdfd09f 100644 --- a/llm/generate/gen_windows.ps1 +++ b/llm/generate/gen_windows.ps1 @@ -117,7 +117,7 @@ function build { if ($cmakeDefs -contains "-G") { $extra=@("-j8") } else { - $extra= @("--", "/p:CL_MPcount=8") + $extra= @("--", "/maxCpuCount:8") } write-host "building with: cmake --build $script:buildDir --config $script:config $($script:cmakeTargets | ForEach-Object { `"--target`", $_ }) $extra" & cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ }) $extra @@ -273,7 +273,7 @@ function build_cuda() { "-DGGML_CUDA=ON", "-DGGML_AVX=on", "-DGGML_AVX2=off", - "-DCMAKE_CUDA_FLAGS=-t8", + "-DCMAKE_CUDA_FLAGS=-t6", "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}", "-DCMAKE_CUDA_COMPILER_TOOLKIT_ROOT=$env:CUDA_PATH" ) From d8be22e47d460d1483846e2effb9b67fbfce1c0b Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Mon, 19 Aug 2024 12:07:18 -0700 Subject: [PATCH 63/71] Fix overlapping artifact name on CI --- .github/workflows/release.yaml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 508fbb35..f6489dac 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -269,7 +269,7 @@ jobs: dist/windows-amd64/** - uses: actions/upload-artifact@v4 with: - name: windows-cuda-deps + name: windows-cuda-deps-${{ matrix.cuda.version }} path: dist/deps/* @@ -328,7 +328,10 @@ jobs: name: generate-windows-cuda-12 - uses: actions/download-artifact@v4 with: - name: windows-cuda-deps + name: windows-cuda-deps-11 + - uses: actions/download-artifact@v4 + with: + name: windows-cuda-deps-12 - uses: actions/download-artifact@v4 with: name: windows-rocm-deps From f91c9e370923d3b10a88732ab577e2728022152d Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Mon, 19 Aug 2024 13:48:45 -0700 Subject: [PATCH 64/71] CI: handle directories during checksum (#6427) --- .github/workflows/release.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index f6489dac..aad49d98 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -472,7 +472,8 @@ jobs: merge-multiple: true - run: | ls -lh dist/ - (cd dist; sha256sum * > sha256sum.txt) + (cd dist; find . 
-type f | xargs sha256sum > ../sha256sum.txt) + mv sha256sum.txt dist/ cat dist/sha256sum.txt - name: Create or update Release run: | From 19e5a890f70b95a55c9de6a55357d78fc0a4ff81 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Mon, 19 Aug 2024 15:19:21 -0700 Subject: [PATCH 65/71] CI: remove directories from dist dir before upload step (#6429) --- .github/workflows/release.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index aad49d98..2cf4d2c2 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -474,6 +474,7 @@ jobs: ls -lh dist/ (cd dist; find . -type f | xargs sha256sum > ../sha256sum.txt) mv sha256sum.txt dist/ + mv dist/linux-???64 . cat dist/sha256sum.txt - name: Create or update Release run: | From a017cf2fea4aaa376087520382058c42cffce097 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Tue, 20 Aug 2024 07:26:38 -0700 Subject: [PATCH 66/71] Split rocm back out of bundle (#6432) We're over budget for github's maximum release artifact size with rocm + 2 cuda versions. This splits rocm back out as a discrete artifact, but keeps the layout so it can be extracted into the same location as the main bundle. --- .github/workflows/release.yaml | 1 + Dockerfile | 4 ++-- llm/generate/gen_linux.sh | 3 ++- scripts/build_linux.sh | 6 ++++++ scripts/install.sh | 5 +++++ 5 files changed, 16 insertions(+), 3 deletions(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 2cf4d2c2..9c1e3e13 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -475,6 +475,7 @@ jobs: (cd dist; find . -type f | xargs sha256sum > ../sha256sum.txt) mv sha256sum.txt dist/ mv dist/linux-???64 . + mv dist/linux-amd64-rocm . 
cat dist/sha256sum.txt - name: Create or update Release run: | diff --git a/Dockerfile b/Dockerfile index d4b86918..c46477b4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -95,8 +95,8 @@ ARG AMDGPU_TARGETS ENV GOARCH amd64 RUN --mount=type=cache,target=/root/.ccache \ OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh -RUN mkdir -p ../../dist/linux-amd64/lib/ollama && \ - (cd /opt/rocm/lib && tar cf - rocblas/library) | (cd ../../dist/linux-amd64/lib/ollama && tar xf - ) +RUN mkdir -p ../../dist/linux-amd64-rocm/lib/ollama && \ + (cd /opt/rocm/lib && tar cf - rocblas/library) | (cd ../../dist/linux-amd64-rocm/lib/ollama && tar xf - ) FROM --platform=linux/amd64 centos:7 AS cpu-builder-amd64 ARG CMAKE_VERSION diff --git a/llm/generate/gen_linux.sh b/llm/generate/gen_linux.sh index aef03f9a..6927dda8 100755 --- a/llm/generate/gen_linux.sh +++ b/llm/generate/gen_linux.sh @@ -260,7 +260,8 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then echo "Building custom ROCM GPU" fi BUILD_DIR="../build/linux/${ARCH}/rocm${ROCM_VARIANT}" - ROCM_DIST_DIR="${DIST_BASE}/lib/ollama" + # ROCm dependencies are too large to fit into a unified bundle + ROCM_DIST_DIR="${DIST_BASE}/../linux-${GOARCH}-rocm/lib/ollama" # TODO figure out how to disable runpath (rpath) # export CMAKE_HIP_FLAGS="-fno-rtlib-add-rpath" # doesn't work export LLAMA_SERVER_LDFLAGS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu" diff --git a/scripts/build_linux.sh b/scripts/build_linux.sh index adda2ad7..6cb0d0cd 100755 --- a/scripts/build_linux.sh +++ b/scripts/build_linux.sh @@ -24,8 +24,14 @@ for TARGETARCH in ${BUILD_ARCH}; do docker create --platform linux/$TARGETARCH --name builder-$TARGETARCH builder:$TARGETARCH rm -rf ./dist/linux-$TARGETARCH docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/dist/linux-$TARGETARCH ./dist + if echo ${TARGETARCH} | grep "amd64" > /dev/null; then + docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/dist/linux-$TARGETARCH-rocm ./dist + fi docker rm builder-$TARGETARCH echo "Compressing final linux bundle..." rm -f ./dist/ollama-linux-$TARGETARCH.tgz (cd dist/linux-$TARGETARCH && tar cf - . | ${GZIP} --best > ../ollama-linux-$TARGETARCH.tgz ) + if [ -d dist/linux-$TARGETARCH-rocm ]; then + (cd dist/linux-$TARGETARCH-rocm && tar cf - . | ${GZIP} --best > ../ollama-linux-$TARGETARCH-rocm.tgz ) + fi done diff --git a/scripts/install.sh b/scripts/install.sh index a02a0675..25f57565 100644 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -199,6 +199,11 @@ fi if check_gpu lspci amdgpu || check_gpu lshw amdgpu; then if [ $BUNDLE -ne 0 ]; then + status "Downloading Linux ROCm ${ARCH} bundle" + curl --fail --show-error --location --progress-bar \ + "https://ollama.com/download/ollama-linux-${ARCH}-rocm.tgz${VER_PARAM}" | \ + $SUDO tar -xzf - -C "$OLLAMA_INSTALL_DIR" + install_success status "AMD GPU ready." 
exit 0 From 5a28b9cf5fcb3994aa1a143118c73c7d1fbf3bf9 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Thu, 6 Jun 2024 08:59:04 -0700 Subject: [PATCH 67/71] bert --- convert/convert.go | 12 ++ convert/convert_bert.go | 176 +++++++++++++++++++++++++ convert/convert_test.go | 1 + convert/reader.go | 2 + convert/testdata/all-MiniLM-L6-v2.json | 124 +++++++++++++++++ convert/tokenizer.go | 31 ++--- 6 files changed, 331 insertions(+), 15 deletions(-) create mode 100644 convert/convert_bert.go create mode 100644 convert/testdata/all-MiniLM-L6-v2.json diff --git a/convert/convert.go b/convert/convert.go index 24c19aa4..f51e9665 100644 --- a/convert/convert.go +++ b/convert/convert.go @@ -66,6 +66,10 @@ type Converter interface { writeFile(io.WriteSeeker, llm.KV, []llm.Tensor) error } +type moreParser interface { + parseMore(fs.FS) error +} + // Convert writes an Ollama compatible model to the provided io.WriteSeeker based on configurations // and files it finds in the input path. // Supported input model formats include safetensors. @@ -95,6 +99,8 @@ func Convert(fsys fs.FS, ws io.WriteSeeker) error { conv = &gemma{} case "Phi3ForCausalLM": conv = &phi3{} + case "BertModel": + conv = &bert{} default: return errors.New("unsupported architecture") } @@ -103,6 +109,12 @@ func Convert(fsys fs.FS, ws io.WriteSeeker) error { return err } + if t, ok := conv.(moreParser); ok { + if err := t.parseMore(fsys); err != nil { + return err + } + } + t, err := parseTokenizer(fsys, conv.specialTokenTypes()) if err != nil { return err diff --git a/convert/convert_bert.go b/convert/convert_bert.go new file mode 100644 index 00000000..62fad147 --- /dev/null +++ b/convert/convert_bert.go @@ -0,0 +1,176 @@ +package convert + +import ( + "cmp" + "encoding/json" + "io/fs" + "path/filepath" + "slices" + "strings" + + "github.com/ollama/ollama/llm" +) + +type bert struct { + Parameters + NLayers uint32 `json:"n_layers"` + NumHiddenLayers uint32 `json:"num_hidden_layers"` + NLayer uint32 `json:"n_layer"` + MaxPositionEmbeddings uint32 `json:"max_position_embeddings"` + NCtx uint32 `json:"n_ctx"` + HiddenSize uint32 `json:"hidden_size"` + NEmbd uint32 `json:"n_embd"` + IntermediateSize uint32 `json:"intermediate_size"` + NInner uint32 `json:"n_inner"` + NumAttentionHeads uint32 `json:"num_attention_heads"` + NHead uint32 `json:"n_head"` + NumKeyValueHeads uint32 `json:"num_key_value_heads"` + LayerNormEPS float32 `json:"layer_norm_eps"` + LayerNormEpsilon float32 `json:"layer_norm_epsilon"` + NormEpsilon float32 `json:"norm_epsilon"` + + PoolingType uint32 +} + +var ( + _ Converter = (*bert)(nil) + _ moreParser = (*bert)(nil) +) + +func (p *bert) parseMore(fsys fs.FS) error { + bts, err := fs.ReadFile(fsys, "modules.json") + if err != nil { + return err + } + + var modules []struct { + Type string `json:"type"` + Path string `json:"path"` + } + + if err := json.Unmarshal(bts, &modules); err != nil { + return err + } + + var pooling string + for _, m := range modules { + if m.Type == "sentence_transformers.models.Pooling" { + pooling = m.Path + break + } + } + + if pooling != "" { + bts, err := fs.ReadFile(fsys, filepath.Join(pooling, "config.json")) + if err != nil { + return err + } + + var pc struct { + PoolingModeCLSToken bool `json:"pooling_mode_cls_token"` + PoolingModeMeanTokens bool `json:"pooling_mode_mean_tokens"` + } + + if err := json.Unmarshal(bts, &pc); err != nil { + return err + } + + if pc.PoolingModeMeanTokens { + p.PoolingType = 1 + } else if pc.PoolingModeCLSToken { + p.PoolingType = 2 + } + } + + return 
nil +} + +func (p *bert) KV(t *Tokenizer) llm.KV { + kv := p.Parameters.KV(t) + kv["general.architecture"] = "bert" + kv["general.name"] = "bert" + kv["bert.attention.causal"] = false + kv["bert.pooling_type"] = p.PoolingType + + kv["bert.block_count"] = cmp.Or(p.NLayers, p.NumHiddenLayers, p.NLayer) + + if contextLength := cmp.Or(p.MaxPositionEmbeddings, p.NCtx); contextLength > 0 { + kv["bert.context_length"] = contextLength + } + + if embeddingLength := cmp.Or(p.HiddenSize, p.NEmbd); embeddingLength > 0 { + kv["bert.embedding_length"] = cmp.Or(p.HiddenSize, p.NEmbd) + } + + if feedForwardLength := cmp.Or(p.IntermediateSize, p.NInner); feedForwardLength > 0 { + kv["bert.feed_forward_length"] = cmp.Or(p.IntermediateSize, p.NInner) + } + + if headCount := cmp.Or(p.NumAttentionHeads, p.NHead); headCount > 0 { + kv["bert.attention.head_count"] = cmp.Or(p.NumAttentionHeads, p.NHead) + } + + if layerNormEpsilon := cmp.Or(p.LayerNormEPS, p.LayerNormEpsilon, p.NormEpsilon); layerNormEpsilon > 0 { + kv["bert.attention.layer_norm_epsilon"] = layerNormEpsilon + } + + kv["tokenizer.ggml.model"] = "bert" + kv["tokenizer.ggml.token_type_count"] = uint32(2) + + // convert to phantom space tokens + for i, e := range t.Tokens { + if strings.HasPrefix(e, "[") && strings.HasSuffix(e, "]") { + // noop + } else if strings.HasPrefix(e, "##") { + t.Tokens[i] = e[2:] + } else { + t.Tokens[i] = "\u2581" + e + } + } + + kv["tokenizer.ggml.tokens"] = t.Tokens + + return kv +} + +func (p *bert) Tensors(ts []Tensor) []llm.Tensor { + var out []llm.Tensor + for _, t := range ts { + if slices.Contains([]string{ + "embeddings.position_ids", + "pooler.dense.weight", + "pooler.dense.bias", + }, t.Name()) { + continue + } + + name := p.tensorName(t.Name()) + out = append(out, llm.Tensor{ + Name: name, + Kind: t.Kind(), + Shape: t.Shape(), + WriterTo: t, + }) + } + + return out +} + +func (bert) tensorName(n string) string { + return strings.NewReplacer( + "encoder.layer", "blk", + "encoder.layers", "blk", + "embeddings.word_embeddings", "token_embd", + "embeddings.token_type_embeddings", "token_types", + "embeddings.LayerNorm", "token_embd_norm", + "embeddings.position_embeddings", "position_embd", + "attention.self.query", "attn_q", + "attention.self.key", "attn_k", + "attention.self.value", "attn_v", + "attention.output.dense", "attn_output", + "attention.output.LayerNorm", "attn_output_norm", + "intermediate.dense", "ffn_up", + "output.dense", "ffn_down", + "output.LayerNorm", "layer_output_norm", + ).Replace(n) +} diff --git a/convert/convert_test.go b/convert/convert_test.go index cb2c585e..e3ab0098 100644 --- a/convert/convert_test.go +++ b/convert/convert_test.go @@ -67,6 +67,7 @@ func TestConvertFull(t *testing.T) { "gemma-2b-it", // microsoft/Phi-3-mini-128-instruct@d548c233192db00165d842bf8edff054bb3212f8 "Phi-3-mini-128k-instruct", + "all-MiniLM-L6-v2", } for i := range cases { diff --git a/convert/reader.go b/convert/reader.go index ce95208e..294a7c40 100644 --- a/convert/reader.go +++ b/convert/reader.go @@ -37,6 +37,8 @@ const ( func (t tensorBase) Kind() uint32 { if strings.HasSuffix(t.name, ".block_sparse_moe.gate.weight") { return 0 + } else if t.name == "embeddings.token_type_embeddings.weight" { + return 0 } switch len(t.shape) { diff --git a/convert/testdata/all-MiniLM-L6-v2.json b/convert/testdata/all-MiniLM-L6-v2.json new file mode 100644 index 00000000..15c8f039 --- /dev/null +++ b/convert/testdata/all-MiniLM-L6-v2.json @@ -0,0 +1,124 @@ +{ + "general.architecture": "bert", + "general.file_type": 
"1", + "general.quantization_version": "2", + "bert.attention.causal": "false", + "bert.attention.head_count": "12", + "bert.attention.layer_norm_epsilon": "1e-12", + "bert.block_count": "6", + "bert.context_length": "512", + "bert.embedding_length": "384", + "bert.feed_forward_length": "1536", + "bert.pooling_type": "1", + "tokenizer.ggml.model": "bert", + "tokenizer.ggml.padding_token_id": "0", + "tokenizer.ggml.unknown_token_id": "100", + "tokenizer.ggml.cls_token_id": "101", + "tokenizer.ggml.seperator_token_id": "102", + "tokenizer.ggml.mask_token_id": "103", + "tokenizer.ggml.token_type_count": "2", + "tokenizer.ggml.scores": "6db964fe67338aca57790481a390121ff3dd643eebe49f7dd308029ad99abb6f", + "tokenizer.ggml.token_type": "98d247c5404b6b18f05f133b92dd56edf6efefefac326794b00d7b351f6c5aa1", + "tokenizer.ggml.tokens": "9efe405e229a45ff9916f54c475d151d2200cd2ab0006f347abfb069cf096c86", + "token_embd.weight": "8c1ee80a9ea4f65aa385ba30112010068af3d209bebc6e149d3d4589c2cd0a5a", + "position_embd.weight": "6c516f0b1c4e2388ab90394dd80ad69e4e4509b890982fc3408108ae66210eb6", + "token_types.weight": "f879f8e422ed211948f28b560d3c5e17aae7993f063b51196a28cf5c0fb3da21", + "token_embd_norm.weight": "75076e095d717aab96f8b6beeee503c27940d9a76f2b891a0e3de72f8a6043e4", + "token_embd_norm.bias": "298735285ffe944e1bf03e5d35c7280326b85cf121bde9874f1af5dc51ab939d", + "blk.0.attn_q.weight": "ab0923ce4c1549175112dcdfcc860fe30137f991e03ea6857fb5993670adaf6c", + "blk.0.attn_q.bias": "a3ec29551dabf976e1d34256b8ab5ab7b758f3ed9742c3cafdbd984d5441df62", + "blk.0.attn_k.weight": "4c1038a6d035c3e9ffed7fa672b614627814752503755fbad0cfb76a41ad71ba", + "blk.0.attn_k.bias": "e0363930eb588d91816aa3d230bb03b6e2551c165117b80b8d60397413819ef9", + "blk.0.attn_v.weight": "425e2e53e3f00ce98d29c3e6a161eb55d3e6ae0d96fdb9f6242d1c4fd6eef4b3", + "blk.0.attn_v.bias": "6579173a1e65ee124fbd0bd53cbdca4225515b4f2c5f18fb1bfd000f5978f9bb", + "blk.0.attn_output.weight": "a6d70a08cd7164de5d12af65d86d657c3db35aaecde778b2b3fda9193c4c9802", + "blk.0.attn_output.bias": "2b8d12c4f9a9c5bfaa29c597839568f6e0525cb41eeaf64ddeb6bd84dfeb9701", + "blk.0.attn_output_norm.weight": "bbe6e502a473228b525aeed26cc31b7db123ad63bdc5a6eebac6ea70b8b51d62", + "blk.0.attn_output_norm.bias": "36eaacaf0007c5c62daea97aab0115390c0682914f78482e37eb76885f4b7a50", + "blk.0.ffn_up.weight": "24654561c76ce387d125759ba843f06b904ef721fcceaeff6ccc62180a48e874", + "blk.0.ffn_up.bias": "fd3f0126aa1d95768fa60eb6f4ab8a2763cfcb7e5405f35b92353031d86f4d34", + "blk.0.ffn_down.weight": "97a829763a6a5bf3329ceb4d39c424ba4787d61653a5b0bbd1f84782e4d4e0ca", + "blk.0.ffn_down.bias": "7aa980c30ae8b4ee7f69df28808dbf5c431f56ccc4a80340f644a0419f16c054", + "blk.0.layer_output_norm.weight": "ef30dad4c2a083ae1ff5039a2a6cda60ecc89bf1e486a6f8c0d15f50589603f8", + "blk.0.layer_output_norm.bias": "8b1b77e67568b1bce43fc476de1b177c53ff688d66beb66995e8eb3dc290da8a", + "blk.1.attn_q.weight": "284331622a1f6f9b87ccee4f652bd66a394ca493c4d93be4d1844e4f6159ad10", + "blk.1.attn_q.bias": "e24ebd4860330e08f6bfdd077a82db0bee33f4c8846cf1db26327a34754c7069", + "blk.1.attn_k.weight": "729dd0d555544b5bd0f7580b3c8b384256b974605f0e7487b95f295aa032997d", + "blk.1.attn_k.bias": "2aa51a828a858f35473f54477583fea54ce2ccc34ea60fbd1d228fbe9bca827f", + "blk.1.attn_v.weight": "6be304671cc311d5ca5c103f2b51467ee800c589bc5b8101e09ff5aed1f68c21", + "blk.1.attn_v.bias": "43bcbab78a8819e07f723bc9e5b737b71e87a7594f15234e882b63e327a64199", + "blk.1.attn_output.weight": "15ec8a1a12b26c9976445308a09f748ab0e4bef0f583d13ab08c3129f8738d73", + 
"blk.1.attn_output.bias": "dac2146f4baa6ed16f6c0dc7443831fb7ec79bedcceafd80d1a4b628a1bb072d", + "blk.1.attn_output_norm.weight": "d2151eb33bffac536787a4c9a5d2b31c7a80b17c4611877842a3cce2cd6e98d8", + "blk.1.attn_output_norm.bias": "31e1b779716dafb855d2cf5631ee168a0ccf372eb9c6ea6091f66fa97a9b9d2d", + "blk.1.ffn_up.weight": "a57547fc3fc3b77406f5cdcb0c87af9bc184701f175c39c1f35297826fce3cc7", + "blk.1.ffn_up.bias": "123be6d541d086202913c75d878c54d59a749f3af7b58f7ef9eb9e7c62a24c9a", + "blk.1.ffn_down.weight": "cfdb79788377e5cbded8790cd41b9e66c397ecab75474071fcd7cf32d30f9613", + "blk.1.ffn_down.bias": "bcb58315519a573097960891c9ae41cf4c685ab78c3e0e77471471758a7eae88", + "blk.1.layer_output_norm.weight": "819b554271452bfb1d84c2603b90377b2e41a0ac1e3aa8b417ccf9dce63375bd", + "blk.1.layer_output_norm.bias": "47a3433ac27f5ce8947fb38dd491f3706df4ef6adb0ddf74612bf0f54b19e164", + "blk.2.attn_q.weight": "1557a9ea852b1880551f7290e00aded4f35e6c4180fdcbed1b0039bf805f639e", + "blk.2.attn_q.bias": "c3bfe5f3066f655fd36b055530997b59ff33ef013563aaeb3cb8ff07dabd59a9", + "blk.2.attn_k.weight": "cfd08eb69c61ae2f9f14f9b7ff5c5394ca264b1a9f3d48156677f90dd1766289", + "blk.2.attn_k.bias": "9b839bc0e79974a0b3f5d1895972bc6f5c9a1bc16052e1af786e6a530758152d", + "blk.2.attn_v.weight": "02b26b1208480eaeeb00e7b4cf8b690006ca14759357fc44ed4a2a8924ead993", + "blk.2.attn_v.bias": "e7e6f0089fded1659a867ab736c220d9653ea7da6b1b94baf5c8d30a748b63ab", + "blk.2.attn_output.weight": "a1db121c7d33806b349cadd050300a57db49fdc91224fd07c9ac43bf4299dc79", + "blk.2.attn_output.bias": "7675128b6a92555cd955c820311e91e9417d31f48848f45d047b4100c62148b3", + "blk.2.attn_output_norm.weight": "5b4595e0fbcba67a700c4331adf746d2fba3546364a4db5607ae241947bb1a21", + "blk.2.attn_output_norm.bias": "7b8e16826ea30e5a2ba0b02e0095a901775981a296e98819625320e983060d08", + "blk.2.ffn_up.weight": "a0d815d946ac07a65095c4ae4df77b818845e6d97795c7d82f55e689d944db59", + "blk.2.ffn_up.bias": "ce37c0a4174d6bf773ded7bd016ede627ad3bdb8bc99b9992a18dc8e8898f252", + "blk.2.ffn_down.weight": "f6231d2a25426fbd45b9f1160aa484220eb227ceef0348c4a6a6de890606e5ef", + "blk.2.ffn_down.bias": "429e00556e8dc63a785238b309b9d83738500c1ef6d736fe6526ad88ea496d27", + "blk.2.layer_output_norm.weight": "651457a573adf3f7dd9ee5dfe1c8e89389e94443993aab77ec6a0b05aa621e35", + "blk.2.layer_output_norm.bias": "41fbbeda7fd89b0cef5f945ae44011c316982390401d6f75ba8c6d365e185247", + "blk.3.attn_q.weight": "95a43f32949d2cb8d22815bb27a44abfc6665ba96221af817dfe058cb6ca72c6", + "blk.3.attn_q.bias": "f4e34385e75d8108b6b3bd336106e2133a8c9be0cc343dfe5dc48c32a823c7cb", + "blk.3.attn_k.weight": "6b892da6a17d4d3265265a15f695864a31813ee8c8e710ae9bc9e1adbc6c9a18", + "blk.3.attn_k.bias": "40b8067b641a56014cee42548240aa8930820958b1933004892b5f04fbaef39e", + "blk.3.attn_v.weight": "9fcd5922319dd2a461082a5ce040c1dfe65d87d70ca6547dd0b46eeecc3eeb2b", + "blk.3.attn_v.bias": "b528c56212e66931fdbe267ac327a9c2f87cd03baff3ea719e30afe681da15f1", + "blk.3.attn_output.weight": "e3b178c1b03981e75510e0d277af23ea59cc404b5394e61bd32291825719b502", + "blk.3.attn_output.bias": "712c84d39a6a5a9c06a09da8fd9939ba0d5525524a4bba61ea4de09b48f45cae", + "blk.3.attn_output_norm.weight": "d1ffac88e675592ff72f8a617be32b4a381d443b2f8f2645dbe44a1e5745aac0", + "blk.3.attn_output_norm.bias": "ea31a1c73146234c50e0e43f485c458413714867b8e2703af66482f7db2d6c40", + "blk.3.ffn_up.weight": "4ef4f3b9a1ea6ab2ef2eb6e8b008e06a44790d099d97482a05a51e39a29afac0", + "blk.3.ffn_up.bias": "06a4296dda16f452675c51f108079fe7722552d6521c737d97734943818b9a2b", + 
"blk.3.ffn_down.weight": "f114b2bebe392c7d80433bb880c6730293aa4561b0b0370dcdaf7472daebd847", + "blk.3.ffn_down.bias": "2c8e67831d28a3bf613fc7912ae3259b63d72abcaf4d30efd8800758400158de", + "blk.3.layer_output_norm.weight": "a1dfeb7b5a51dd56447312ca41e2ad2f361a3ea12ddc355127f5f4219fb0a482", + "blk.3.layer_output_norm.bias": "1ed630021b25c6c6fc93fd32988b9907df966d4982a93081f639aac3044618ab", + "blk.4.attn_q.weight": "b5fae4c1f9a5f33a2a2e816ac0c01c25f422e4efdd59ef1ed93da2610e5370fc", + "blk.4.attn_q.bias": "c2e376524ea98ac3b10d9eee19ecb1b1e261fa5149efe0232844c923dfb428fb", + "blk.4.attn_k.weight": "a4632f5ebf9321d9d08f9112a4e5dda2efe5671df4a4e67fee24845f5b14af16", + "blk.4.attn_k.bias": "a9a02ffb8b8b4f6dfe487a7e0341f1d5318c9d2b793a688f34cb1b22fc66ef60", + "blk.4.attn_v.weight": "10ad8deb81d9fa093b1e5c0f24ea82aa7df43e6aca49e260fcbea56eab8cc86a", + "blk.4.attn_v.bias": "7326813e181e021130bd33ac136293fcffccce2d1d8cb59041e5b13a8cceacf6", + "blk.4.attn_output.weight": "c92573088c7437c2b3cda51490e152c27fb19e5468df591eabba5a49d5398d44", + "blk.4.attn_output.bias": "14e10b419e5859af1eb685af5c330aee67048cd704dcead9217840c6f5393222", + "blk.4.attn_output_norm.weight": "02b6831c0e0fb0edbc579a92812a1dd972cb15d14fcd382d4427c5a7b300ac44", + "blk.4.attn_output_norm.bias": "7eed5cd503bb6bb6ceb1bc8b07cc077903a4f14fb8b9d6cdf39644815ecf1374", + "blk.4.ffn_up.weight": "8d0c91d62e74d6431321116a37cf3339e630bd50ba164d3304fc4fe8dd831223", + "blk.4.ffn_up.bias": "d325f07f73c005a273c484c7be8e7abb4d6e8a5c4fd093f5869133b97629d017", + "blk.4.ffn_down.weight": "7ba7bd81143f40537b84f938e403e19f30e4928625eb371de052b9025beb4d21", + "blk.4.ffn_down.bias": "2853d9c2a75288214a4bf4907dc19d04d01926f4913d302b1aa7bdbfcce0f7a1", + "blk.4.layer_output_norm.weight": "a4ed1885fa77b90fed5300c355ef0aa0c876a8c747151d9d790939d464d57d4f", + "blk.4.layer_output_norm.bias": "62142a81e813a9e636333b2b805d6bc3b17c5e7cd4b15adce1ada6bc9a32563c", + "blk.5.attn_q.weight": "afc1dff080a72c3daad01384b1448d476aaf789871017c8ff8e144788887995d", + "blk.5.attn_q.bias": "748a820371c1d4f872c84545b36358d239c35bf6c99e2812c237d88c3292763b", + "blk.5.attn_k.weight": "59e30c1ed8acd2cbb01de5f62e7804015b9ecf98ba157d98cab016344639eda5", + "blk.5.attn_k.bias": "f839520078f9e589496e982e86d0126c7aa14196047339abffcf49a696229f77", + "blk.5.attn_v.weight": "3e21fb874e21b90308e1f46af034a3c32d3eba1628d62ae5f2246d6af5818923", + "blk.5.attn_v.bias": "5cd4852bf95c1444d10d756750f6bf49f842c0b39e9953c7f408bb67c325ac8c", + "blk.5.attn_output.weight": "636ce6a7752895f204b9d01ba0aedd9a294f908b42f372c22a16d9dd590d7471", + "blk.5.attn_output.bias": "82d924d4b0d2b94f2bbff91619216d6967a3541ce9b1531a6a60457a67b5d219", + "blk.5.attn_output_norm.weight": "5e7bd0a8d3396080f3360d7c4700bf094a06216431bd014c4479eef72ecf4271", + "blk.5.attn_output_norm.bias": "66c6de5edda5466d029c6753780be81ccd4218bf8bc00680000e0f06856ab712", + "blk.5.ffn_up.weight": "5bbf6e7ea380e216e33f8bee06d25f2265359d3876a300e92bc6e41d48e33430", + "blk.5.ffn_up.bias": "9d795388bb36fb33ad3a37fea3ccb4937838e02800a608fb47d363cd06b47370", + "blk.5.ffn_down.weight": "2fd628974e7f075479dd227b46fbd48ae8d3ca34d735b36f391ac06410730368", + "blk.5.ffn_down.bias": "cd213ba9eaa75fa541648097fbe9c96e58077e6c3ad6ad2fb1f21f8350f44291", + "blk.5.layer_output_norm.weight": "159a9df41d15b7022d136f86a2a2631c4635f9816e957472217077b522bcf52a", + "blk.5.layer_output_norm.bias": "24c1f27ffd1eb4e5be7e3a2909943e6f0980635d761fa1efdd0c19645da23766" +} diff --git a/convert/tokenizer.go b/convert/tokenizer.go index 0d42a6d8..653df6d2 100644 --- 
a/convert/tokenizer.go +++ b/convert/tokenizer.go @@ -1,7 +1,6 @@ package convert import ( - "cmp" "crypto/sha256" "encoding/hex" "encoding/json" @@ -11,6 +10,8 @@ import ( "log/slog" "os" "slices" + + "golang.org/x/exp/maps" ) const ( @@ -184,32 +185,32 @@ func parseVocabularyFromTokenizer(fsys fs.FS) (*Vocabulary, error) { return nil, err } - var tokens []token + tokens := make(map[int]token, len(t.Model.Vocab)) for k, v := range t.Model.Vocab { - tokens = append(tokens, token{ + tokens[v] = token{ ID: v, Content: k, - }) + } } - for _, t := range t.AddedTokens { - t.UserDefined = true - tokens = append(tokens, t) + for _, token := range t.AddedTokens { + token.UserDefined = true + tokens[token.ID] = token } - slices.SortFunc(tokens, func(i, j token) int { - return cmp.Compare(i.ID, j.ID) - }) + keys := maps.Keys(tokens) + slices.Sort(keys) v := Vocabulary{Model: "gpt2"} - for _, t := range tokens { - v.Tokens = append(v.Tokens, t.Content) - v.Scores = append(v.Scores, float32(t.ID)) + for _, k := range keys { + token := tokens[k] + v.Tokens = append(v.Tokens, token.Content) + v.Scores = append(v.Scores, float32(token.ID)) switch { - case t.Special: + case token.Special: v.Types = append(v.Types, tokenTypeControl) - case t.UserDefined: + case token.UserDefined: v.Types = append(v.Types, tokenTypeUserDefined) default: v.Types = append(v.Types, tokenTypeNormal) From beb49eef65acefc64a6ae0562ce58467e6974fde Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Fri, 7 Jun 2024 14:55:56 -0700 Subject: [PATCH 68/71] create bert models from cli --- cmd/cmd.go | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/cmd/cmd.go b/cmd/cmd.go index fd7246c8..a8a02605 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -223,6 +223,14 @@ func tempZipFiles(path string) (string, error) { } files = append(files, js...) + // bert models require a nested config.json + // TODO(mxyng): merge this with the glob above + js, err = glob(filepath.Join(path, "**/*.json"), "text/plain") + if err != nil { + return "", err + } + files = append(files, js...) 
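+ // note: with Go's glob patterns, **/*.json only descends a single
+ // directory level, which covers e.g. a sentence-transformers
+ // 1_Pooling/config.json alongside the top-level *.json matched above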
+ if tks, _ := glob(filepath.Join(path, "tokenizer.model"), "application/octet-stream"); len(tks) > 0 { // add tokenizer.model if it exists, tokenizer.json is automatically picked up by the previous glob // tokenizer.model might be a unresolved git lfs reference; error if it is @@ -252,6 +260,11 @@ func tempZipFiles(path string) (string, error) { return "", err } + zfi.Name, err = filepath.Rel(path, file) + if err != nil { + return "", err + } + zf, err := zipfile.CreateHeader(zfi) if err != nil { return "", err From 3546bbd08c52df73eb6523b06b13f1b2dfeaa5fb Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Fri, 28 Jun 2024 13:27:05 -0700 Subject: [PATCH 69/71] convert gemma2 --- convert/convert.go | 11 ++++++-- convert/convert_bert.go | 9 +++--- convert/convert_gemma.go | 14 ++++----- convert/convert_gemma2.go | 44 +++++++++++++++++++++++++++++ convert/convert_llama.go | 19 ++++++------- convert/convert_mixtral.go | 9 ++++-- convert/convert_phi3.go | 11 ++++---- convert/convert_test.go | 1 + convert/reader.go | 12 ++++---- convert/reader_safetensors.go | 5 ++-- convert/reader_torch.go | 5 ++-- convert/testdata/gemma-2-9b-it.json | 6 ++++ convert/tokenizer_spm.go | 32 ++++++++++++++++++++- 13 files changed, 132 insertions(+), 46 deletions(-) create mode 100644 convert/convert_gemma2.go create mode 100644 convert/testdata/gemma-2-9b-it.json diff --git a/convert/convert.go b/convert/convert.go index f51e9665..5a314cdd 100644 --- a/convert/convert.go +++ b/convert/convert.go @@ -7,6 +7,7 @@ import ( "io" "io/fs" "log/slog" + "strings" "github.com/ollama/ollama/llm" ) @@ -58,11 +59,13 @@ type Converter interface { KV(*Tokenizer) llm.KV // Tensors maps input tensors to LLM tensors. Model specific modifications can be done here. Tensors([]Tensor) []llm.Tensor + // Replacements returns a list of string pairs to replace in tensor names. 
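+ // The pairs are applied in order via strings.NewReplacer when tensors are parsed.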
+ // See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details + Replacements() []string - // tensorName returns the LLM tensor name for a specific input name - tensorName(string) string // specialTokenTypes returns any special token types the model uses specialTokenTypes() []string + // writeFile writes the model to the provided io.WriteSeeker writeFile(io.WriteSeeker, llm.KV, []llm.Tensor) error } @@ -97,6 +100,8 @@ func Convert(fsys fs.FS, ws io.WriteSeeker) error { conv = &mixtral{} case "GemmaForCausalLM": conv = &gemma{} + case "Gemma2ForCausalLM": + conv = &gemma2{} case "Phi3ForCausalLM": conv = &phi3{} case "BertModel": @@ -131,7 +136,7 @@ func Convert(fsys fs.FS, ws io.WriteSeeker) error { slog.Debug("vocabulary", "size", len(t.Vocabulary.Tokens)) } - ts, err := parseTensors(fsys) + ts, err := parseTensors(fsys, strings.NewReplacer(conv.Replacements()...)) if err != nil { return err } diff --git a/convert/convert_bert.go b/convert/convert_bert.go index 62fad147..4547a705 100644 --- a/convert/convert_bert.go +++ b/convert/convert_bert.go @@ -144,9 +144,8 @@ func (p *bert) Tensors(ts []Tensor) []llm.Tensor { continue } - name := p.tensorName(t.Name()) out = append(out, llm.Tensor{ - Name: name, + Name: t.Name(), Kind: t.Kind(), Shape: t.Shape(), WriterTo: t, @@ -156,8 +155,8 @@ func (p *bert) Tensors(ts []Tensor) []llm.Tensor { return out } -func (bert) tensorName(n string) string { - return strings.NewReplacer( +func (bert) Replacements() []string { + return []string{ "encoder.layer", "blk", "encoder.layers", "blk", "embeddings.word_embeddings", "token_embd", @@ -172,5 +171,5 @@ func (bert) tensorName(n string) string { "intermediate.dense", "ffn_up", "output.dense", "ffn_down", "output.LayerNorm", "layer_output_norm", - ).Replace(n) + } } diff --git a/convert/convert_gemma.go b/convert/convert_gemma.go index 9213e157..333e4c83 100644 --- a/convert/convert_gemma.go +++ b/convert/convert_gemma.go @@ -44,15 +44,14 @@ func (p *gemma) KV(t *Tokenizer) llm.KV { } func (p *gemma) Tensors(ts []Tensor) []llm.Tensor { - var out []llm.Tensor + out := make([]llm.Tensor, 0, len(ts)) for _, t := range ts { - name := p.tensorName(t.Name()) - if strings.HasSuffix(name, "_norm.weight") { + if strings.HasSuffix(t.Name(), "_norm.weight") { t.SetRepacker(p.addOne) } out = append(out, llm.Tensor{ - Name: name, + Name: t.Name(), Kind: t.Kind(), Shape: t.Shape(), WriterTo: t, @@ -62,8 +61,8 @@ func (p *gemma) Tensors(ts []Tensor) []llm.Tensor { return out } -func (p *gemma) tensorName(n string) string { - return strings.NewReplacer( +func (p *gemma) Replacements() []string { + return []string{ "model.embed_tokens", "token_embd", "model.norm", "output_norm", "model.layers", "blk", @@ -76,8 +75,7 @@ func (p *gemma) tensorName(n string) string { "mlp.down_proj", "ffn_down", "mlp.up_proj", "ffn_up", "post_attention_layernorm", "ffn_norm", - "block_sparse_moe.gate", "ffn_inp", - ).Replace(n) + } } func (*gemma) addOne(_ string, data []float32, shape []uint64) ([]float32, error) { diff --git a/convert/convert_gemma2.go b/convert/convert_gemma2.go new file mode 100644 index 00000000..66be02d6 --- /dev/null +++ b/convert/convert_gemma2.go @@ -0,0 +1,44 @@ +package convert + +import ( + "github.com/ollama/ollama/llm" +) + +type gemma2 struct { + gemma + SlidingWindow uint32 `json:"sliding_window"` + AttentionLogitSoftcap float32 `json:"attn_logit_softcapping"` + FinalLogitSoftcap float32 `json:"final_logit_softcapping"` +} + +func (p *gemma2) KV(t *Tokenizer) llm.KV { + kv := p.Parameters.KV(t) + 
kv["general.architecture"] = "gemma2" + kv["general.name"] = "gemma2" + kv["gemma2.context_length"] = p.MaxPositionEmbeddings + kv["gemma2.embedding_length"] = p.HiddenSize + kv["gemma2.block_count"] = p.HiddenLayers + kv["gemma2.feed_forward_length"] = p.IntermediateSize + kv["gemma2.attention.head_count"] = p.NumAttentionHeads + kv["gemma2.attention.head_count_kv"] = p.NumKeyValueHeads + kv["gemma2.attention.layer_norm_rms_epsilon"] = p.RMSNormEPS + kv["gemma2.attention.key_length"] = p.HeadDim + kv["gemma2.attention.value_length"] = p.HeadDim + kv["gemma2.attention.sliding_window"] = p.SlidingWindow + kv["gemma2.attn_logit_softcapping"] = p.AttentionLogitSoftcap + kv["gemma2.final_logit_softcapping"] = p.FinalLogitSoftcap + kv["tokenizer.ggml.eot_token_id"] = uint32(107) + kv["tokenizer.ggml.middle_token_id"] = uint32(68) + kv["tokenizer.ggml.prefix_token_id"] = uint32(67) + kv["tokenizer.ggml.suffix_token_id"] = uint32(69) + return kv +} + +func (p *gemma2) Replacements() []string { + return append( + p.gemma.Replacements(), + "post_attention_layernorm", "post_attention_norm", + "pre_feedforward_layernorm", "ffn_norm", + "post_feedforward_layernorm", "post_ffw_norm", + ) +} diff --git a/convert/convert_llama.go b/convert/convert_llama.go index 178b13f3..498d1321 100644 --- a/convert/convert_llama.go +++ b/convert/convert_llama.go @@ -96,14 +96,13 @@ func (p *llama) KV(t *Tokenizer) llm.KV { func (p *llama) Tensors(ts []Tensor) []llm.Tensor { var out []llm.Tensor for _, t := range ts { - name := p.tensorName(t.Name()) - if strings.HasSuffix(name, "attn_q.weight") || - strings.HasSuffix(name, "attn_k.weight") { + if strings.HasSuffix(t.Name(), "attn_q.weight") || + strings.HasSuffix(t.Name(), "attn_k.weight") { t.SetRepacker(p.repack) } out = append(out, llm.Tensor{ - Name: name, + Name: t.Name(), Kind: t.Kind(), Shape: t.Shape(), WriterTo: t, @@ -113,8 +112,8 @@ func (p *llama) Tensors(ts []Tensor) []llm.Tensor { return out } -func (p *llama) tensorName(n string) string { - return strings.NewReplacer( +func (p *llama) Replacements() []string { + return []string{ "lm_head", "output", "model.embed_tokens", "token_embd", "model.norm", "output_norm", @@ -128,9 +127,7 @@ func (p *llama) tensorName(n string) string { "mlp.down_proj", "ffn_down", "mlp.up_proj", "ffn_up", "post_attention_layernorm", "ffn_norm", - // mixtral - "block_sparse_moe.gate", "ffn_gate_inp", - ).Replace(n) + } } func (p *llama) repack(name string, data []float32, shape []uint64) ([]float32, error) { @@ -140,9 +137,9 @@ func (p *llama) repack(name string, data []float32, shape []uint64) ([]float32, } var heads uint32 - if strings.HasSuffix(name, "q_proj.weight") { + if strings.HasSuffix(name, "attn_q.weight") { heads = p.NumAttentionHeads - } else if strings.HasSuffix(name, "k_proj.weight") { + } else if strings.HasSuffix(name, "attn_k.weight") { heads = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads) } else { return nil, fmt.Errorf("unknown tensor for repack: %s", name) diff --git a/convert/convert_mixtral.go b/convert/convert_mixtral.go index 3263a27b..97a86b30 100644 --- a/convert/convert_mixtral.go +++ b/convert/convert_mixtral.go @@ -15,8 +15,6 @@ type mixtral struct { NumExpertsPerToken uint32 `json:"num_experts_per_tok"` } -var _ Converter = (*mixtral)(nil) - func (p *mixtral) KV(t *Tokenizer) llm.KV { kv := p.llama.KV(t) @@ -72,6 +70,13 @@ func (p *mixtral) Tensors(ts []Tensor) []llm.Tensor { return append(out, p.llama.Tensors(ts)...) 
} +func (p *mixtral) Replacements() []string { + return append( + p.llama.Replacements(), + "block_sparse_moe.gate", "ffn_gate_inp", + ) +} + type experts []Tensor func (e experts) WriteTo(w io.Writer) (int64, error) { diff --git a/convert/convert_phi3.go b/convert/convert_phi3.go index 0f645217..4ee59ff5 100644 --- a/convert/convert_phi3.go +++ b/convert/convert_phi3.go @@ -74,8 +74,7 @@ func (p *phi3) Tensors(ts []Tensor) []llm.Tensor { out := make([]llm.Tensor, 0, len(ts)+2) for _, t := range ts { - name := p.tensorName(t.Name()) - if strings.HasPrefix(name, "blk.0.") { + if strings.HasPrefix(t.Name(), "blk.0.") { addRopeFactors.Do(func() { out = append(out, llm.Tensor{ Name: "rope_factors_long.weight", @@ -92,7 +91,7 @@ func (p *phi3) Tensors(ts []Tensor) []llm.Tensor { } out = append(out, llm.Tensor{ - Name: name, + Name: t.Name(), Kind: t.Kind(), Shape: t.Shape(), WriterTo: t, @@ -102,8 +101,8 @@ func (p *phi3) Tensors(ts []Tensor) []llm.Tensor { return out } -func (p *phi3) tensorName(n string) string { - return strings.NewReplacer( +func (p *phi3) Replacements() []string { + return []string{ "lm_head", "output", "model.embed_tokens", "token_embd", "model.norm", "output_norm", @@ -114,7 +113,7 @@ func (p *phi3) tensorName(n string) string { "mlp.down_proj", "ffn_down", "mlp.gate_up_proj", "ffn_up", "post_attention_layernorm", "ffn_norm", - ).Replace(n) + } } type ropeFactor []float32 diff --git a/convert/convert_test.go b/convert/convert_test.go index e3ab0098..e78afab7 100644 --- a/convert/convert_test.go +++ b/convert/convert_test.go @@ -68,6 +68,7 @@ func TestConvertFull(t *testing.T) { // microsoft/Phi-3-mini-128-instruct@d548c233192db00165d842bf8edff054bb3212f8 "Phi-3-mini-128k-instruct", "all-MiniLM-L6-v2", + "gemma-2-9b-it", } for i := range cases { diff --git a/convert/reader.go b/convert/reader.go index 294a7c40..5bba0406 100644 --- a/convert/reader.go +++ b/convert/reader.go @@ -35,9 +35,9 @@ const ( ) func (t tensorBase) Kind() uint32 { - if strings.HasSuffix(t.name, ".block_sparse_moe.gate.weight") { - return 0 - } else if t.name == "embeddings.token_type_embeddings.weight" { + if strings.HasSuffix(t.name, ".ffn_gate_inp.weight") || + t.name == "token_types.weight" { + // these tensors are always F32 return 0 } @@ -57,10 +57,10 @@ func (t *tensorBase) SetRepacker(fn repacker) { type repacker func(string, []float32, []uint64) ([]float32, error) -func parseTensors(fsys fs.FS) ([]Tensor, error) { +func parseTensors(fsys fs.FS, replacer *strings.Replacer) ([]Tensor, error) { patterns := []struct { Pattern string - Func func(fs.FS, ...string) ([]Tensor, error) + Func func(fs.FS, *strings.Replacer, ...string) ([]Tensor, error) }{ {"model-*-of-*.safetensors", parseSafetensors}, {"model.safetensors", parseSafetensors}, @@ -76,7 +76,7 @@ func parseTensors(fsys fs.FS) ([]Tensor, error) { } if len(matches) > 0 { - return pattern.Func(fsys, matches...) + return pattern.Func(fsys, replacer, matches...) 
} } diff --git a/convert/reader_safetensors.go b/convert/reader_safetensors.go index 42f902a5..32a362cd 100644 --- a/convert/reader_safetensors.go +++ b/convert/reader_safetensors.go @@ -8,6 +8,7 @@ import ( "io" "io/fs" "slices" + "strings" "github.com/d4l3k/go-bfloat16" "github.com/x448/float16" @@ -20,7 +21,7 @@ type safetensorMetadata struct { Offsets []int64 `json:"data_offsets"` } -func parseSafetensors(fsys fs.FS, ps ...string) ([]Tensor, error) { +func parseSafetensors(fsys fs.FS, replacer *strings.Replacer, ps ...string) ([]Tensor, error) { var ts []Tensor for _, p := range ps { f, err := fsys.Open(p) @@ -56,7 +57,7 @@ func parseSafetensors(fsys fs.FS, ps ...string) ([]Tensor, error) { offset: safetensorsPad(n, value.Offsets[0]), size: safetensorsPad(n, value.Offsets[1]) - safetensorsPad(n, value.Offsets[0]), tensorBase: &tensorBase{ - name: key, + name: replacer.Replace(key), shape: value.Shape, }, }) diff --git a/convert/reader_torch.go b/convert/reader_torch.go index 531996bf..1b3e1c9f 100644 --- a/convert/reader_torch.go +++ b/convert/reader_torch.go @@ -3,12 +3,13 @@ package convert import ( "io" "io/fs" + "strings" "github.com/nlpodyssey/gopickle/pytorch" "github.com/nlpodyssey/gopickle/types" ) -func parseTorch(fsys fs.FS, ps ...string) ([]Tensor, error) { +func parseTorch(fsys fs.FS, replacer *strings.Replacer, ps ...string) ([]Tensor, error) { var ts []Tensor for _, p := range ps { pt, err := pytorch.Load(p) @@ -27,7 +28,7 @@ func parseTorch(fsys fs.FS, ps ...string) ([]Tensor, error) { ts = append(ts, torch{ storage: t.(*pytorch.Tensor).Source, tensorBase: &tensorBase{ - name: k.(string), + name: replacer.Replace(k.(string)), shape: shape, }, }) diff --git a/convert/testdata/gemma-2-9b-it.json b/convert/testdata/gemma-2-9b-it.json new file mode 100644 index 00000000..90cdbee4 --- /dev/null +++ b/convert/testdata/gemma-2-9b-it.json @@ -0,0 +1,6 @@ +{ + "general.architecture": "gemma2", + "gemma2.attention.sliding_window": "4096", + "gemma2.attn_logit_softcapping": "50", + "gemma2.final_logit_softcapping": "30" +} diff --git a/convert/tokenizer_spm.go b/convert/tokenizer_spm.go index babf702c..5e506087 100644 --- a/convert/tokenizer_spm.go +++ b/convert/tokenizer_spm.go @@ -15,6 +15,11 @@ import ( ) func parseSentencePiece(fsys fs.FS) (*Vocabulary, error) { + ast, err := parseAdditionalSpecialTokens(fsys) + if err != nil { + return nil, err + } + bts, err := fs.ReadFile(fsys, "tokenizer.model") if err != nil { return nil, err @@ -37,7 +42,12 @@ func parseSentencePiece(fsys fs.FS) (*Vocabulary, error) { sentencepiece.ModelProto_SentencePiece_BYTE: v.Types = append(v.Types, int32(t)) default: - v.Types = append(v.Types, int32(sentencepiece.ModelProto_SentencePiece_NORMAL)) + tt := int32(sentencepiece.ModelProto_SentencePiece_NORMAL) + if slices.Contains(ast, piece.GetPiece()) { + tt = int32(sentencepiece.ModelProto_SentencePiece_CONTROL) + } + + v.Types = append(v.Types, tt) } } @@ -81,3 +91,23 @@ func parseSentencePiece(fsys fs.FS) (*Vocabulary, error) { return &v, nil } + +func parseAdditionalSpecialTokens(fsys fs.FS) ([]string, error) { + f, err := fsys.Open("special_tokens_map.json") + if errors.Is(err, os.ErrNotExist) { + return nil, nil + } else if err != nil { + return nil, err + } + defer f.Close() + + var m struct { + AdditionalSpecialTokens []string `json:"additional_special_tokens"` + } + + if err := json.NewDecoder(f).Decode(&m); err != nil { + return nil, err + } + + return m.AdditionalSpecialTokens, nil +} From 77903ab8b4fb8075faad7bde5bde2eee3173e407 Mon Sep 
17 00:00:00 2001 From: Michael Yang Date: Mon, 29 Jul 2024 14:53:02 -0700 Subject: [PATCH 70/71] llama3.1 --- convert/convert_bert.go | 1 - convert/convert_gemma.go | 1 - convert/convert_gemma2.go | 1 - convert/convert_llama.go | 43 +++++++++++++++++-- convert/convert_phi3.go | 1 - convert/convert_test.go | 1 + .../testdata/Meta-Llama-3.1-8B-Instruct.json | 3 ++ llm/memory_test.go | 1 - server/sched_test.go | 1 - 9 files changed, 44 insertions(+), 9 deletions(-) create mode 100644 convert/testdata/Meta-Llama-3.1-8B-Instruct.json diff --git a/convert/convert_bert.go b/convert/convert_bert.go index 4547a705..6e7d59fe 100644 --- a/convert/convert_bert.go +++ b/convert/convert_bert.go @@ -88,7 +88,6 @@ func (p *bert) parseMore(fsys fs.FS) error { func (p *bert) KV(t *Tokenizer) llm.KV { kv := p.Parameters.KV(t) kv["general.architecture"] = "bert" - kv["general.name"] = "bert" kv["bert.attention.causal"] = false kv["bert.pooling_type"] = p.PoolingType diff --git a/convert/convert_gemma.go b/convert/convert_gemma.go index 333e4c83..c4316808 100644 --- a/convert/convert_gemma.go +++ b/convert/convert_gemma.go @@ -26,7 +26,6 @@ var _ Converter = (*gemma)(nil) func (p *gemma) KV(t *Tokenizer) llm.KV { kv := p.Parameters.KV(t) kv["general.architecture"] = "gemma" - kv["general.name"] = "gemma" kv["gemma.context_length"] = p.MaxPositionEmbeddings kv["gemma.embedding_length"] = p.HiddenSize kv["gemma.block_count"] = p.HiddenLayers diff --git a/convert/convert_gemma2.go b/convert/convert_gemma2.go index 66be02d6..084f9c52 100644 --- a/convert/convert_gemma2.go +++ b/convert/convert_gemma2.go @@ -14,7 +14,6 @@ type gemma2 struct { func (p *gemma2) KV(t *Tokenizer) llm.KV { kv := p.Parameters.KV(t) kv["general.architecture"] = "gemma2" - kv["general.name"] = "gemma2" kv["gemma2.context_length"] = p.MaxPositionEmbeddings kv["gemma2.embedding_length"] = p.HiddenSize kv["gemma2.block_count"] = p.HiddenLayers diff --git a/convert/convert_llama.go b/convert/convert_llama.go index 498d1321..27f924fb 100644 --- a/convert/convert_llama.go +++ b/convert/convert_llama.go @@ -3,6 +3,7 @@ package convert import ( "cmp" "fmt" + "math" "strings" "github.com/pdevine/tensor" @@ -27,8 +28,14 @@ type llama struct { NumKeyValueHeads uint32 `json:"num_key_value_heads"` RopeTheta float32 `json:"rope_theta"` RopeScaling struct { - Type string `json:"type"` - Factor float32 `json:"factor"` + Type string `json:"type"` + RopeType string `json:"rope_type"` + Factor float32 `json:"factor"` + LowFrequencyFactor float32 `json:"low_freq_factor"` + HighFrequencyFactor float32 `json:"high_freq_factor"` + OriginalMaxPositionalEmbeddings uint32 `json:"original_max_positional_embeddings"` + + factors ropeFactor } `json:"rope_scaling"` RMSNormEPS float32 `json:"rms_norm_eps"` LayerNormEPS float32 `json:"layer_norm_eps"` @@ -42,7 +49,6 @@ var _ Converter = (*llama)(nil) func (p *llama) KV(t *Tokenizer) llm.KV { kv := p.Parameters.KV(t) kv["general.architecture"] = "llama" - kv["general.name"] = "llama" kv["llama.vocab_size"] = p.VocabSize kv["llama.block_count"] = cmp.Or(p.NLayers, p.NumHiddenLayers, p.NLayer) @@ -71,6 +77,27 @@ func (p *llama) KV(t *Tokenizer) llm.KV { if p.RopeScaling.Type == "linear" { kv["llama.rope.scaling.type"] = p.RopeScaling.Type kv["llama.rope.scaling.factor"] = p.RopeScaling.Factor + } else if p.RopeScaling.RopeType == "llama3" { + dim := p.HiddenSize / p.NumAttentionHeads + for i := uint32(0); i < dim; i += 2 { + factor := cmp.Or(p.RopeScaling.Factor, 8.0) + factorLow := cmp.Or(p.RopeScaling.LowFrequencyFactor, 
1.0) + factorHigh := cmp.Or(p.RopeScaling.HighFrequencyFactor, 4.0) + + original := cmp.Or(p.RopeScaling.OriginalMaxPositionalEmbeddings, 8192) + lambdaLow := float32(original) / factorLow + lambdaHigh := float32(original) / factorHigh + + lambda := 2 * math.Pi * math.Pow(float64(p.RopeTheta), float64(i)/float64(dim)) + if lambda < float64(lambdaHigh) { + p.RopeScaling.factors = append(p.RopeScaling.factors, 1.0) + } else if lambda > float64(lambdaLow) { + p.RopeScaling.factors = append(p.RopeScaling.factors, factor) + } else { + smooth := (float32(original)/float32(lambda) - factorLow) / (factorHigh - factorLow) + p.RopeScaling.factors = append(p.RopeScaling.factors, 1.0/((1-smooth)/factor+smooth)) + } + } } if p.NumKeyValueHeads > 0 { @@ -95,6 +122,16 @@ func (p *llama) KV(t *Tokenizer) llm.KV { func (p *llama) Tensors(ts []Tensor) []llm.Tensor { var out []llm.Tensor + + if p.RopeScaling.factors != nil { + out = append(out, llm.Tensor{ + Name: "rope_freqs.weight", + Kind: 0, + Shape: []uint64{uint64(len(p.RopeScaling.factors))}, + WriterTo: p.RopeScaling.factors, + }) + } + for _, t := range ts { if strings.HasSuffix(t.Name(), "attn_q.weight") || strings.HasSuffix(t.Name(), "attn_k.weight") { diff --git a/convert/convert_phi3.go b/convert/convert_phi3.go index 4ee59ff5..64d3d012 100644 --- a/convert/convert_phi3.go +++ b/convert/convert_phi3.go @@ -40,7 +40,6 @@ var _ Converter = (*phi3)(nil) func (p *phi3) KV(t *Tokenizer) llm.KV { kv := p.Parameters.KV(t) kv["general.architecture"] = "phi3" - kv["general.name"] = "phi3" kv["phi3.context_length"] = p.MaxPositionEmbeddings kv["phi3.embedding_length"] = cmp.Or(p.HiddenSize, p.NEmbd) kv["phi3.feed_forward_length"] = p.IntermediateSize diff --git a/convert/convert_test.go b/convert/convert_test.go index e78afab7..64b7df3b 100644 --- a/convert/convert_test.go +++ b/convert/convert_test.go @@ -62,6 +62,7 @@ func TestMain(m *testing.M) { func TestConvertFull(t *testing.T) { cases := []string{ "Meta-Llama-3-8B-Instruct", + "Meta-Llama-3.1-8B-Instruct", "Mistral-7B-Instruct-v0.2", "Mixtral-8x7B-Instruct-v0.1", "gemma-2b-it", diff --git a/convert/testdata/Meta-Llama-3.1-8B-Instruct.json b/convert/testdata/Meta-Llama-3.1-8B-Instruct.json new file mode 100644 index 00000000..ad7cd20a --- /dev/null +++ b/convert/testdata/Meta-Llama-3.1-8B-Instruct.json @@ -0,0 +1,3 @@ +{ + "rope_freqs.weight": "80fd5efb2f729381785b293a091a268cfeceb0079167f6ece9b07070e662b222" +} diff --git a/llm/memory_test.go b/llm/memory_test.go index 6cf0119f..ffb14286 100644 --- a/llm/memory_test.go +++ b/llm/memory_test.go @@ -33,7 +33,6 @@ func TestEstimateGPULayers(t *testing.T) { assert.Len(t, tensors, inputLayerCount+1) err = WriteGGUF(f, KV{ "general.architecture": "llama", - "general.name": "name", "llama.context_length": uint32(32), "llama.embedding_length": uint32(4096), "llama.block_count": uint32(inputLayerCount), diff --git a/server/sched_test.go b/server/sched_test.go index 713b9259..fb049574 100644 --- a/server/sched_test.go +++ b/server/sched_test.go @@ -117,7 +117,6 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est require.NoError(t, llm.WriteGGUF(f, llm.KV{ "general.architecture": "llama", - "general.name": "name", "llama.context_length": uint32(32), "llama.embedding_length": uint32(4096), "llama.block_count": uint32(1), From 90ca84172c2a98ecfd76eb7e05cd3e33e1dde507 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Thu, 22 Aug 2024 14:51:42 -0700 Subject: [PATCH 71/71] Fix embeddings memory corruption (#6467) * Fix embeddings 
memory corruption

The patch was leading to a buffer overrun corruption. Once removed, though,
parallelism in server.cpp led to hitting an assert due to slot/seq IDs being
>= token count. To work around this, only use slot 0 for embeddings.

* Fix embed integration test assumption

The token eval count has changed with recent llama.cpp bumps (0.3.5+)
---
 integration/embed_test.go | 8 ++---
 llm/ext_server/server.cpp | 8 ++++-
 llm/patches/08-pooling.diff | 60 -------------------------------------
 server/sched.go | 5 ++++
 4 files changed, 16 insertions(+), 65 deletions(-)
 delete mode 100644 llm/patches/08-pooling.diff

diff --git a/integration/embed_test.go b/integration/embed_test.go
index 10333d5d..4a68af68 100644
--- a/integration/embed_test.go
+++ b/integration/embed_test.go
@@ -70,8 +70,8 @@ func TestAllMiniLMEmbed(t *testing.T) {
 t.Fatalf("expected 0.010071031, got %.8f", res.Embeddings[0][0])
 }

- if res.PromptEvalCount != 8 {
- t.Fatalf("expected 8 prompt tokens, got %d", res.PromptEvalCount)
+ if res.PromptEvalCount != 6 {
+ t.Fatalf("expected 6 prompt tokens, got %d", res.PromptEvalCount)
 }
 }

@@ -102,8 +102,8 @@ func TestAllMiniLMBatchEmbed(t *testing.T) {
 t.Fatalf("expected 0.010071031 and -0.009802706, got %.8f and %.8f", res.Embeddings[0][0], res.Embeddings[1][0])
 }

- if res.PromptEvalCount != 16 {
- t.Fatalf("expected 16 prompt tokens, got %d", res.PromptEvalCount)
+ if res.PromptEvalCount != 12 {
+ t.Fatalf("expected 12 prompt tokens, got %d", res.PromptEvalCount)
 }
 }

diff --git a/llm/ext_server/server.cpp b/llm/ext_server/server.cpp
index 5717c17a..8e08b850 100644
--- a/llm/ext_server/server.cpp
+++ b/llm/ext_server/server.cpp
@@ -1429,7 +1429,13 @@ struct llama_server_context
 switch (task.type)
 {
 case TASK_TYPE_COMPLETION: {
- server_slot *slot = prefix_slot(task.data["prompt"]);
+ server_slot *slot = nullptr;
+ if (task.embedding_mode) {
+ // Embedding seq_id (aka slot id) must always be <= token length, so always use slot 0
+ slot = slots[0].available() ?
&slots[0] : nullptr; + } else { + slot = prefix_slot(task.data["prompt"]); + } if (slot == nullptr) { // if no slot is available, we defer this task for processing later diff --git a/llm/patches/08-pooling.diff b/llm/patches/08-pooling.diff deleted file mode 100644 index 2e4fe11e..00000000 --- a/llm/patches/08-pooling.diff +++ /dev/null @@ -1,60 +0,0 @@ -diff --git a/src/llama.cpp b/src/llama.cpp -index 721b8f4e..cfe7ac40 100644 ---- a/src/llama.cpp -+++ b/src/llama.cpp -@@ -8420,14 +8420,14 @@ struct llm_build_context { - } - - struct ggml_tensor * build_inp_mean() { -- lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens); -+ lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, cparams.n_seq_max); - cb(lctx.inp_mean, "inp_mean", -1); - ggml_set_input(lctx.inp_mean); - return lctx.inp_mean; - } - - struct ggml_tensor * build_inp_cls() { -- lctx.inp_cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); -+ lctx.inp_cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, cparams.n_seq_max); - cb(lctx.inp_cls, "inp_cls", -1); - ggml_set_input(lctx.inp_cls); - return lctx.inp_cls; -@@ -13847,19 +13847,16 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer)); - - float * data = (float *) lctx.inp_mean->data; -- memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean)); -+ memset(lctx.inp_mean->data, 0, n_tokens * cparams.n_seq_max * ggml_element_size(lctx.inp_mean)); - - std::vector sum(n_tokens, 0); - for (int i = 0; i < n_tokens; ++i) { - const llama_seq_id seq_id = batch.seq_id[i][0]; -- -- GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN"); -- - sum[seq_id] += 1; - } - -- std::vector div(n_tokens, 0.0f); -- for (int i = 0; i < n_tokens; ++i) { -+ std::vector div(cparams.n_seq_max, 0.0f); -+ for (uint32_t i = 0; i < cparams.n_seq_max; ++i) { - const uint64_t s = sum[i]; - if (s > 0) { - div[i] = 1.0f/float(s); -@@ -13879,14 +13876,11 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer)); - - uint32_t * data = (uint32_t *) lctx.inp_cls->data; -- memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls)); -+ memset(lctx.inp_cls->data, 0, cparams.n_seq_max * ggml_element_size(lctx.inp_cls)); - - for (int i = 0; i < n_tokens; ++i) { - const llama_seq_id seq_id = batch.seq_id[i][0]; - const llama_pos pos = batch.pos[i]; -- -- GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS"); -- - if (pos == 0) { - data[seq_id] = i; - } diff --git a/server/sched.go b/server/sched.go index 9d8c4144..58071bf0 100644 --- a/server/sched.go +++ b/server/sched.go @@ -193,6 +193,11 @@ func (s *Scheduler) processPending(ctx context.Context) { break } + // Embedding models should always be loaded with parallel=1 + if pending.model.CheckCapabilities(CapabilityCompletion) != nil { + numParallel = 1 + } + // Evaluate if the model will fit in the available system memory, or if we should unload a model first if len(gpus) == 1 && gpus[0].Library == "cpu" { // simplifying assumption of defaultParallel when in CPU mode
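
For context on the slot-0 constraint above: a minimal client-side sketch of the
batched embedding path this patch stabilizes. It assumes the Go api package's
Embed endpoint, the same one integration/embed_test.go exercises, and is
illustrative rather than part of the patch.

    package main

    import (
    	"context"
    	"fmt"
    	"log"

    	"github.com/ollama/ollama/api"
    )

    func main() {
    	client, err := api.ClientFromEnvironment()
    	if err != nil {
    		log.Fatal(err)
    	}

    	// A batched request: each input is embedded as its own sequence.
    	// Before this fix, parallel slots could hand out seq IDs >= the
    	// batch token count and trip llama.cpp's pooling assert; embeddings
    	// are now pinned to slot 0 with parallel=1.
    	res, err := client.Embed(context.Background(), &api.EmbedRequest{
    		Model: "all-minilm",
    		Input: []string{"why is the sky blue?", "why is the grass green?"},
    	})
    	if err != nil {
    		log.Fatal(err)
    	}

    	fmt.Println(len(res.Embeddings), "embeddings,", res.PromptEvalCount, "prompt tokens")
    }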
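
The rope-scaling arithmetic added in PATCH 70 (llama3.1) is easier to
sanity-check in isolation. Below is a standalone sketch using the patch's
cmp.Or fallbacks (factor 8, low/high frequency factors 1 and 4, original
context length 8192), plus an assumed rope_theta of 500000 and head dimension
of 128 as in the 8B model; it prints the per-dimension frequency factors that
end up in rope_freqs.weight.

    package main

    import (
    	"fmt"
    	"math"
    )

    func main() {
    	const (
    		ropeTheta  = 500000.0 // llama3.1 rope_theta (assumed here)
    		dim        = 128      // head dim: hidden_size / num_attention_heads
    		factor     = 8.0      // cmp.Or fallbacks from the patch
    		factorLow  = 1.0
    		factorHigh = 4.0
    		original   = 8192.0 // original max position embeddings
    	)

    	lambdaLow := original / factorLow   // wavelengths above this are fully scaled
    	lambdaHigh := original / factorHigh // wavelengths below this are left alone

    	for i := 0; i < dim; i += 2 {
    		// wavelength of rotary dimension i
    		lambda := 2 * math.Pi * math.Pow(ropeTheta, float64(i)/float64(dim))

    		var f float64
    		switch {
    		case lambda < lambdaHigh:
    			f = 1.0 // high-frequency dims: unscaled
    		case lambda > lambdaLow:
    			f = factor // low-frequency dims: fully scaled
    		default:
    			// smooth interpolation between the two regimes
    			smooth := (original/lambda - factorLow) / (factorHigh - factorLow)
    			f = 1.0 / ((1-smooth)/factor + smooth)
    		}

    		fmt.Printf("i=%3d lambda=%14.2f factor=%.4f\n", i, lambda, f)
    	}
    }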