From ebc529cbb3f0b27f6c154fa90e724db8243a7614 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Fri, 5 Jul 2024 17:31:23 -0700 Subject: [PATCH 01/79] autodetect stop parameters from template --- server/model.go | 21 ++++++++++++++++++--- server/routes_create_test.go | 3 ++- template/alfred.json | 8 ++++++++ template/alpaca.json | 6 ++++++ template/chatml.json | 6 ++++++ template/chatqa.json | 8 ++++++++ template/codellama-70b-instruct.json | 7 +++++++ template/falcon-instruct.json | 6 ++++++ template/gemma-instruct.json | 6 ++++++ template/granite-instruct.json | 7 +++++++ template/llama2-chat.json | 8 ++++++++ template/llama3-instruct.json | 7 +++++++ template/magicoder.json | 6 ++++++ template/mistral-instruct.json | 6 ++++++ template/openchat.json | 5 +++++ template/phi-3.json | 8 ++++++++ template/solar-instruct.json | 7 +++++++ template/starcoder2-instruct.json | 7 +++++++ template/template.go | 14 ++++++++++++++ template/vicuna.json | 6 ++++++ template/zephyr.json | 8 ++++++++ 21 files changed, 156 insertions(+), 4 deletions(-) create mode 100644 template/alfred.json create mode 100644 template/alpaca.json create mode 100644 template/chatml.json create mode 100644 template/chatqa.json create mode 100644 template/codellama-70b-instruct.json create mode 100644 template/falcon-instruct.json create mode 100644 template/gemma-instruct.json create mode 100644 template/granite-instruct.json create mode 100644 template/llama2-chat.json create mode 100644 template/llama3-instruct.json create mode 100644 template/magicoder.json create mode 100644 template/mistral-instruct.json create mode 100644 template/openchat.json create mode 100644 template/phi-3.json create mode 100644 template/solar-instruct.json create mode 100644 template/starcoder2-instruct.json create mode 100644 template/vicuna.json create mode 100644 template/zephyr.json diff --git a/server/model.go b/server/model.go index a79f549a..d33ffaec 100644 --- a/server/model.go +++ b/server/model.go @@ -4,6 +4,7 @@ import ( "archive/zip" "bytes" "context" + "encoding/json" "errors" "fmt" "io" @@ -259,13 +260,27 @@ func detectChatTemplate(layers []*layerGGML) ([]*layerGGML, error) { if t, err := template.Named(s); err != nil { slog.Debug("template detection", "error", err) } else { - tmpl, err := NewLayer(t.Reader(), "application/vnd.ollama.image.template") + layer, err := NewLayer(t.Reader(), "application/vnd.ollama.image.template") if err != nil { return nil, err } - tmpl.status = fmt.Sprintf("using autodetected template %s", t.Name) - layers = append(layers, &layerGGML{tmpl, nil}) + layer.status = fmt.Sprintf("using autodetected template %s", t.Name) + layers = append(layers, &layerGGML{layer, nil}) + + if t.Parameters != nil { + var b bytes.Buffer + if err := json.NewEncoder(&b).Encode(t.Parameters); err != nil { + return nil, err + } + + layer, err := NewLayer(&b, "application/vnd.ollama.image.params") + if err != nil { + return nil, err + } + + layers = append(layers, &layerGGML{layer, nil}) + } } } } diff --git a/server/routes_create_test.go b/server/routes_create_test.go index 04174b92..84672087 100644 --- a/server/routes_create_test.go +++ b/server/routes_create_test.go @@ -545,9 +545,10 @@ func TestCreateDetectTemplate(t *testing.T) { } checkFileExists(t, filepath.Join(p, "blobs", "*"), []string{ + filepath.Join(p, "blobs", "sha256-0d79f567714c62c048378f2107fb332dabee0135d080c302d884317da9433cc5"), filepath.Join(p, "blobs", "sha256-553c4a3f747b3d22a4946875f1cc8ed011c2930d83f864a0c7265f9ec0a20413"), filepath.Join(p, "blobs", 
"sha256-c608dc615584cd20d9d830363dabf8a4783ae5d34245c3d8c115edb3bc7b28e4"), - filepath.Join(p, "blobs", "sha256-f836ee110db21567f826332e4cedd746c06d10664fd5a9ea3659e3683a944510"), + filepath.Join(p, "blobs", "sha256-ea34c57ba5b78b740aafe2aeb74dc6507fc3ad14170b64c26a04fb9e36c88d75"), }) }) diff --git a/template/alfred.json b/template/alfred.json new file mode 100644 index 00000000..edac21af --- /dev/null +++ b/template/alfred.json @@ -0,0 +1,8 @@ +{ + "stop": [ + "", + "", + "", + "" + ] +} diff --git a/template/alpaca.json b/template/alpaca.json new file mode 100644 index 00000000..eafe2b8a --- /dev/null +++ b/template/alpaca.json @@ -0,0 +1,6 @@ +{ + "stop": [ + "### Instruction:", + "### Response" + ] +} diff --git a/template/chatml.json b/template/chatml.json new file mode 100644 index 00000000..7afeb3de --- /dev/null +++ b/template/chatml.json @@ -0,0 +1,6 @@ +{ + "stop": [ + "<|im_start|>", + "<|im_end|>" + ] +} diff --git a/template/chatqa.json b/template/chatqa.json new file mode 100644 index 00000000..64dd0f33 --- /dev/null +++ b/template/chatqa.json @@ -0,0 +1,8 @@ +{ + "stop": [ + "System:", + "User:", + "Assistant:", + "<|begin_of_text|>" + ] +} diff --git a/template/codellama-70b-instruct.json b/template/codellama-70b-instruct.json new file mode 100644 index 00000000..a56a63f1 --- /dev/null +++ b/template/codellama-70b-instruct.json @@ -0,0 +1,7 @@ +{ + "stop": [ + "Source:", + "Destination:", + "" + ] +} diff --git a/template/falcon-instruct.json b/template/falcon-instruct.json new file mode 100644 index 00000000..a0da0e81 --- /dev/null +++ b/template/falcon-instruct.json @@ -0,0 +1,6 @@ +{ + "stop": [ + "User:", + "Assistant:" + ] +} diff --git a/template/gemma-instruct.json b/template/gemma-instruct.json new file mode 100644 index 00000000..f4ad415c --- /dev/null +++ b/template/gemma-instruct.json @@ -0,0 +1,6 @@ +{ + "stop": [ + "", + "" + ] +} diff --git a/template/granite-instruct.json b/template/granite-instruct.json new file mode 100644 index 00000000..0933e4b5 --- /dev/null +++ b/template/granite-instruct.json @@ -0,0 +1,7 @@ +{ + "stop": [ + "System:", + "Question:", + "Answer:" + ] +} diff --git a/template/llama2-chat.json b/template/llama2-chat.json new file mode 100644 index 00000000..17590ab4 --- /dev/null +++ b/template/llama2-chat.json @@ -0,0 +1,8 @@ +{ + "stop": [ + "[INST]", + "[/INST]", + "<>", + "<>" + ] +} diff --git a/template/llama3-instruct.json b/template/llama3-instruct.json new file mode 100644 index 00000000..c4e9d448 --- /dev/null +++ b/template/llama3-instruct.json @@ -0,0 +1,7 @@ +{ + "stop": [ + "<|start_header_id|>", + "<|end_header_id|>", + "<|eot_id|>" + ] +} diff --git a/template/magicoder.json b/template/magicoder.json new file mode 100644 index 00000000..6f67cab0 --- /dev/null +++ b/template/magicoder.json @@ -0,0 +1,6 @@ +{ + "stop": [ + "@@ Instruction", + "@@ Response" + ] +} diff --git a/template/mistral-instruct.json b/template/mistral-instruct.json new file mode 100644 index 00000000..7afeb3de --- /dev/null +++ b/template/mistral-instruct.json @@ -0,0 +1,6 @@ +{ + "stop": [ + "<|im_start|>", + "<|im_end|>" + ] +} diff --git a/template/openchat.json b/template/openchat.json new file mode 100644 index 00000000..0edc341f --- /dev/null +++ b/template/openchat.json @@ -0,0 +1,5 @@ +{ + "stop": [ + "<|end_of_turn|>" + ] +} diff --git a/template/phi-3.json b/template/phi-3.json new file mode 100644 index 00000000..27bf7664 --- /dev/null +++ b/template/phi-3.json @@ -0,0 +1,8 @@ +{ + "stop": [ + "<|end|>", + "<|system|>", + "<|user|>", + 
"<|assistant|>" + ] +} diff --git a/template/solar-instruct.json b/template/solar-instruct.json new file mode 100644 index 00000000..7b7a9050 --- /dev/null +++ b/template/solar-instruct.json @@ -0,0 +1,7 @@ +{ + "stop": [ + "### System:", + "### User:", + "### Assistant" + ] +} diff --git a/template/starcoder2-instruct.json b/template/starcoder2-instruct.json new file mode 100644 index 00000000..31348908 --- /dev/null +++ b/template/starcoder2-instruct.json @@ -0,0 +1,7 @@ +{ + "stop": [ + "### Instruction", + "### Response", + "<|endoftext|>" + ] +} diff --git a/template/template.go b/template/template.go index 9b351666..9bb6a399 100644 --- a/template/template.go +++ b/template/template.go @@ -23,6 +23,7 @@ import ( var indexBytes []byte //go:embed *.gotmpl +//go:embed *.json var templatesFS embed.FS var templatesOnce = sync.OnceValues(func() ([]*named, error) { @@ -39,6 +40,15 @@ var templatesOnce = sync.OnceValues(func() ([]*named, error) { // normalize line endings t.Bytes = bytes.ReplaceAll(bts, []byte("\r\n"), []byte("\n")) + + params, err := templatesFS.ReadFile(t.Name + ".json") + if err != nil { + continue + } + + if err := json.Unmarshal(params, &t.Parameters); err != nil { + return nil, err + } } return templates, nil @@ -48,6 +58,10 @@ type named struct { Name string `json:"name"` Template string `json:"template"` Bytes []byte + + Parameters *struct { + Stop []string `json:"stop"` + } } func (t named) Reader() io.Reader { diff --git a/template/vicuna.json b/template/vicuna.json new file mode 100644 index 00000000..ed7bfb0f --- /dev/null +++ b/template/vicuna.json @@ -0,0 +1,6 @@ +{ + "stop": [ + "USER:", + "ASSISTANT:" + ] +} diff --git a/template/zephyr.json b/template/zephyr.json new file mode 100644 index 00000000..f9c0115c --- /dev/null +++ b/template/zephyr.json @@ -0,0 +1,8 @@ +{ + "stop": [ + "<|system|>", + "", + "<|user|>", + "<|assistant|>" + ] +} From f02f83660c2e6f0741932bb31a28b82950144dfc Mon Sep 17 00:00:00 2001 From: lreed Date: Wed, 17 Jul 2024 21:44:19 +0000 Subject: [PATCH 02/79] bump go version to 1.22.5 to fix security vulnerabilities --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index ca393496..c8efdd8a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -ARG GOLANG_VERSION=1.22.1 +ARG GOLANG_VERSION=1.22.5 ARG CMAKE_VERSION=3.22.1 # this CUDA_VERSION corresponds with the one specified in docs/gpu.md ARG CUDA_VERSION=11.3.1 From a3c20e3f181607760ee86893baaf31b3c7fd3012 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Mon, 22 Jul 2024 08:52:16 -0700 Subject: [PATCH 03/79] Refine error reporting for subprocess crash On windows, the exit status winds up being the search term many users search for and end up piling in on issues that are unrelated. This refines the reporting so that if we have a more detailed message we'll suppress the exit status portion of the message. 
--- llm/server.go | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/llm/server.go b/llm/server.go index ba7eab03..08463ef0 100644 --- a/llm/server.go +++ b/llm/server.go @@ -417,7 +417,17 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr // reap subprocess when it exits go func() { - s.done <- s.cmd.Wait() + err := s.cmd.Wait() + // Favor a more detailed message over the process exit status + if err != nil && s.status != nil && s.status.LastErrMsg != "" { + slog.Debug("llama runner terminated", "error", err) + if strings.Contains(s.status.LastErrMsg, "unknown model") { + s.status.LastErrMsg = "this model is not supported by your version of Ollama. You may need to upgrade" + } + s.done <- fmt.Errorf(s.status.LastErrMsg) + } else { + s.done <- err + } }() return s, nil @@ -580,14 +590,7 @@ func (s *llmServer) WaitUntilRunning(ctx context.Context) error { slog.Warn("client connection closed before server finished loading, aborting load") return fmt.Errorf("timed out waiting for llama runner to start: %w", ctx.Err()) case err := <-s.done: - msg := "" - if s.status != nil && s.status.LastErrMsg != "" { - msg = s.status.LastErrMsg - } - if strings.Contains(msg, "unknown model") { - return fmt.Errorf("this model is not supported by your version of Ollama. You may need to upgrade") - } - return fmt.Errorf("llama runner process has terminated: %v %s", err, msg) + return fmt.Errorf("llama runner process has terminated: %w", err) default: } if time.Now().After(stallTimer) { From cc269ba0943ee1fa0bddcce8027d0a6d1b86fec5 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Mon, 22 Jul 2024 09:08:11 -0700 Subject: [PATCH 04/79] Remove no longer supported max vram var The OLLAMA_MAX_VRAM env var was a temporary workaround for OOM scenarios. With Concurrency this was no longer wired up, and the simplistic value doesn't map to multi-GPU setups. Users can still set `num_gpu` to limit memory usage to avoid OOM if we get our predictions wrong. 
--- cmd/cmd.go | 1 - envconfig/config.go | 13 ------------- integration/concurrency_test.go | 4 ++-- 3 files changed, 2 insertions(+), 16 deletions(-) diff --git a/cmd/cmd.go b/cmd/cmd.go index 2252a905..b761d018 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -1344,7 +1344,6 @@ func NewCLI() *cobra.Command { envVars["OLLAMA_TMPDIR"], envVars["OLLAMA_FLASH_ATTENTION"], envVars["OLLAMA_LLM_LIBRARY"], - envVars["OLLAMA_MAX_VRAM"], }) default: appendEnvDocs(cmd, envs) diff --git a/envconfig/config.go b/envconfig/config.go index 62d661eb..0abc6968 100644 --- a/envconfig/config.go +++ b/envconfig/config.go @@ -43,8 +43,6 @@ var ( MaxRunners int // Set via OLLAMA_MAX_QUEUE in the environment MaxQueuedRequests int - // Set via OLLAMA_MAX_VRAM in the environment - MaxVRAM uint64 // Set via OLLAMA_MODELS in the environment ModelsDir string // Set via OLLAMA_NOHISTORY in the environment @@ -89,7 +87,6 @@ func AsMap() map[string]EnvVar { "OLLAMA_LLM_LIBRARY": {"OLLAMA_LLM_LIBRARY", LLMLibrary, "Set LLM library to bypass autodetection"}, "OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners, "Maximum number of loaded models per GPU"}, "OLLAMA_MAX_QUEUE": {"OLLAMA_MAX_QUEUE", MaxQueuedRequests, "Maximum number of queued requests"}, - "OLLAMA_MAX_VRAM": {"OLLAMA_MAX_VRAM", MaxVRAM, "Maximum VRAM"}, "OLLAMA_MODELS": {"OLLAMA_MODELS", ModelsDir, "The path to the models directory"}, "OLLAMA_NOHISTORY": {"OLLAMA_NOHISTORY", NoHistory, "Do not preserve readline history"}, "OLLAMA_NOPRUNE": {"OLLAMA_NOPRUNE", NoPrune, "Do not prune model blobs on startup"}, @@ -194,16 +191,6 @@ func LoadConfig() { TmpDir = clean("OLLAMA_TMPDIR") - userLimit := clean("OLLAMA_MAX_VRAM") - if userLimit != "" { - avail, err := strconv.ParseUint(userLimit, 10, 64) - if err != nil { - slog.Error("invalid setting, ignoring", "OLLAMA_MAX_VRAM", userLimit, "error", err) - } else { - MaxVRAM = avail - } - } - LLMLibrary = clean("OLLAMA_LLM_LIBRARY") if onp := clean("OLLAMA_NUM_PARALLEL"); onp != "" { diff --git a/integration/concurrency_test.go b/integration/concurrency_test.go index d66ba9f0..8593285b 100644 --- a/integration/concurrency_test.go +++ b/integration/concurrency_test.go @@ -69,7 +69,7 @@ func TestIntegrationConcurrentPredictOrcaMini(t *testing.T) { reqLimit := len(req) iterLimit := 5 - vram := os.Getenv("OLLAMA_MAX_VRAM") + vram := os.Getenv("OLLAMA_MAX_VRAM") // TODO - discover actual VRAM if vram != "" { max, err := strconv.ParseUint(vram, 10, 64) require.NoError(t, err) @@ -106,7 +106,7 @@ func TestIntegrationConcurrentPredictOrcaMini(t *testing.T) { // Stress the system if we know how much VRAM it has, and attempt to load more models than will fit func TestMultiModelStress(t *testing.T) { - vram := os.Getenv("OLLAMA_MAX_VRAM") + vram := os.Getenv("OLLAMA_MAX_VRAM") // TODO - discover actual VRAM if vram == "" { t.Skip("OLLAMA_MAX_VRAM not specified, can't pick the right models for the stress test") } From b3e5491e41811294de9d81649a96581af6522d08 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Mon, 22 Jul 2024 12:38:03 -0400 Subject: [PATCH 05/79] server: collect nested tool call objects when parsing (#5824) --- server/model.go | 43 +++++++++++++++++++++-------- server/model_test.go | 1 + server/routes.go | 4 +-- server/testdata/tools/xlam.gotmpl | 45 +++++++++++++++++++++++++++++++ server/testdata/tools/xlam.out | 40 +++++++++++++++++++++++++++ 5 files changed, 120 insertions(+), 13 deletions(-) create mode 100644 server/testdata/tools/xlam.gotmpl create mode 100644 server/testdata/tools/xlam.out diff 
--git a/server/model.go b/server/model.go index a084dd8c..bf38c415 100644 --- a/server/model.go +++ b/server/model.go @@ -344,6 +344,10 @@ func (m *Model) parseToolCalls(s string) ([]api.ToolCall, bool) { } } + if name == "" || arguments == "" { + return nil, false + } + var objs []map[string]any for offset := 0; offset < len(s); { var obj map[string]any @@ -361,23 +365,40 @@ func (m *Model) parseToolCalls(s string) ([]api.ToolCall, bool) { return nil, false } else { offset += int(decoder.InputOffset()) - objs = append(objs, obj) + + // collect all nested objects + var collect func(any) []map[string]any + collect = func(obj any) (all []map[string]any) { + switch o := obj.(type) { + case map[string]any: + all = append(all, o) + for _, v := range o { + all = append(all, collect(v)...) + } + case []any: + for _, v := range o { + all = append(all, collect(v)...) + } + } + + return all + } + objs = append(objs, collect(obj)...) } } var toolCalls []api.ToolCall for _, kv := range objs { - var call api.ToolCall - for k, v := range kv { - switch k { - case name: - call.Function.Name = v.(string) - case arguments: - call.Function.Arguments = v.(map[string]any) - } + n, nok := kv[name].(string) + a, aok := kv[arguments].(map[string]any) + if nok && aok { + toolCalls = append(toolCalls, api.ToolCall{ + Function: api.ToolCallFunction{ + Name: n, + Arguments: a, + }, + }) } - - toolCalls = append(toolCalls, call) } return toolCalls, len(toolCalls) > 0 diff --git a/server/model_test.go b/server/model_test.go index 7c826b06..5829adfc 100644 --- a/server/model_test.go +++ b/server/model_test.go @@ -166,6 +166,7 @@ The temperature in San Francisco, CA is 70°F and in Toronto, Canada is 20°C.`, {"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}} {"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}} `, true}, + {"xlam", `{"tool_calls": [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]}`, true}, } var tools []api.Tool diff --git a/server/routes.go b/server/routes.go index 85db7924..0d7ca003 100644 --- a/server/routes.go +++ b/server/routes.go @@ -611,10 +611,10 @@ func (s *Server) CreateModelHandler(c *gin.Context) { quantization := cmp.Or(r.Quantize, r.Quantization) if err := CreateModel(ctx, name, filepath.Dir(r.Path), strings.ToUpper(quantization), f, fn); err != nil { if errors.Is(err, errBadTemplate) { - ch <- gin.H{"error": err.Error(), "status": http.StatusBadRequest} + ch <- gin.H{"error": err.Error(), "status": http.StatusBadRequest} } ch <- gin.H{"error": err.Error()} - } + } }() if r.Stream != nil && !*r.Stream { diff --git a/server/testdata/tools/xlam.gotmpl b/server/testdata/tools/xlam.gotmpl new file mode 100644 index 00000000..51513d69 --- /dev/null +++ b/server/testdata/tools/xlam.gotmpl @@ -0,0 +1,45 @@ +{{- if .System }}{{ .System }} +{{ end }} +{{- range $i, $_ := .Messages }} +{{- if eq .Role "user" }}### Instruction: +{{- if and $.Tools (le (len (slice $.Messages $i)) 2) }} +[BEGIN OF TASK INSTRUCTION] +You are an expert in composing functions. You are given a question and a set of possible functions. +Based on the question, you will need to make one or more function/tool calls to achieve the purpose. +If none of the functions can be used, point it out and refuse to answer. 
+If the given question lacks the parameters required by the function, also point it out. +[END OF TASK INSTRUCTION] + +[BEGIN OF AVAILABLE TOOLS] +{{ $.Tools }} +[END OF AVAILABLE TOOLS] + +[BEGIN OF FORMAT INSTRUCTION] +The output MUST strictly adhere to the following JSON format, and NO other text MUST be included. +The example format is as follows. Please make sure the parameter type is correct. If no function call is needed, please make tool_calls an empty list '[]'. +``` +{ + "tool_calls": [ + {"name": "func_name1", "arguments": {"argument1": "value1", "argument2": "value2"}}, + ... (more tool calls as required) + ] +} +``` +[END OF FORMAT INSTRUCTION] + +[BEGIN OF QUERY] +{{ .Content }} +[END OF QUERY] + + +{{ else }} +{{ .Content }} +{{ end }} +{{- else if .ToolCalls }}### Response: +{"tool_calls": [{{ range .ToolCalls }}{"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}}{{ end }}]} +<|EOT|> +{{ else if eq .Role "assistant" }}### Response: +{{ .Content }} +<|EOT|> +{{ end }} +{{- end }}### Response: \ No newline at end of file diff --git a/server/testdata/tools/xlam.out b/server/testdata/tools/xlam.out new file mode 100644 index 00000000..a4a9952f --- /dev/null +++ b/server/testdata/tools/xlam.out @@ -0,0 +1,40 @@ +You are a knowledgable assistant. You can answer questions and perform tasks. +### Instruction: +What's the weather like today in Paris? +### Response: +{"tool_calls": [{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Paris, France"}}]} +<|EOT|> +### Response: +The current temperature in Paris, France is 22 degrees Celsius. +<|EOT|> +### Instruction: +[BEGIN OF TASK INSTRUCTION] +You are an expert in composing functions. You are given a question and a set of possible functions. +Based on the question, you will need to make one or more function/tool calls to achieve the purpose. +If none of the functions can be used, point it out and refuse to answer. +If the given question lacks the parameters required by the function, also point it out. +[END OF TASK INSTRUCTION] + +[BEGIN OF AVAILABLE TOOLS] +[{"type":"function","function":{"name":"get_current_weather","description":"Get the current weather","parameters":{"type":"object","required":["location","format"],"properties":{"format":{"type":"string","description":"The temperature unit to use. Infer this from the users location.","enum":["celsius","fahrenheit"]},"location":{"type":"string","description":"The city and state, e.g. San Francisco, CA"}}}}}] +[END OF AVAILABLE TOOLS] + +[BEGIN OF FORMAT INSTRUCTION] +The output MUST strictly adhere to the following JSON format, and NO other text MUST be included. +The example format is as follows. Please make sure the parameter type is correct. If no function call is needed, please make tool_calls an empty list '[]'. +``` +{ + "tool_calls": [ + {"name": "func_name1", "arguments": {"argument1": "value1", "argument2": "value2"}}, + ... (more tool calls as required) + ] +} +``` +[END OF FORMAT INSTRUCTION] + +[BEGIN OF QUERY] +What's the weather like today in San Francisco and Toronto? 
+[END OF QUERY] + + +### Response: \ No newline at end of file From f8fedbda20b1b2531499ba64758642b0568b6f01 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Mon, 22 Jul 2024 12:42:00 -0400 Subject: [PATCH 06/79] Update llama.cpp submodule commit to `d94c6e0c` (#5805) --- llm/llama.cpp | 2 +- llm/patches/05-default-pretokenizer.diff | 10 +- ...{07-embeddings.diff => 06-embeddings.diff} | 0 llm/patches/06-qwen2.diff | 13 - ...clip-unicode.diff => 07-clip-unicode.diff} | 0 .../{09-pooling.diff => 08-pooling.diff} | 0 llm/patches/09-lora.diff | 360 ++++++++++++++++++ llm/patches/10-tekken.diff | 43 --- llm/patches/11-embd_kv.diff | 19 - 9 files changed, 366 insertions(+), 81 deletions(-) rename llm/patches/{07-embeddings.diff => 06-embeddings.diff} (100%) delete mode 100644 llm/patches/06-qwen2.diff rename llm/patches/{08-clip-unicode.diff => 07-clip-unicode.diff} (100%) rename llm/patches/{09-pooling.diff => 08-pooling.diff} (100%) create mode 100644 llm/patches/09-lora.diff delete mode 100644 llm/patches/10-tekken.diff delete mode 100644 llm/patches/11-embd_kv.diff diff --git a/llm/llama.cpp b/llm/llama.cpp index a8db2a9c..d94c6e0c 160000 --- a/llm/llama.cpp +++ b/llm/llama.cpp @@ -1 +1 @@ -Subproject commit a8db2a9ce64cd4417f6a312ab61858f17f0f8584 +Subproject commit d94c6e0ccbd29ee1ba4f44e9caa8682ad94df9fa diff --git a/llm/patches/05-default-pretokenizer.diff b/llm/patches/05-default-pretokenizer.diff index 341a6f59..646bc49c 100644 --- a/llm/patches/05-default-pretokenizer.diff +++ b/llm/patches/05-default-pretokenizer.diff @@ -1,8 +1,8 @@ diff --git a/src/llama.cpp b/src/llama.cpp -index 2b9ace28..172640e2 100644 +index 8fe51971..7113ba64 100644 --- a/src/llama.cpp +++ b/src/llama.cpp -@@ -5357,16 +5357,7 @@ static void llm_load_vocab( +@@ -5433,16 +5433,7 @@ static void llm_load_vocab( if (vocab.type == LLAMA_VOCAB_TYPE_BPE) { vocab.tokenizer_add_space_prefix = false; vocab.tokenizer_clean_spaces = true; @@ -20,9 +20,9 @@ index 2b9ace28..172640e2 100644 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; } else if ( tokenizer_pre == "llama3" || -@@ -5439,7 +5430,8 @@ static void llm_load_vocab( - tokenizer_pre == "jais") { - vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS; +@@ -5526,7 +5517,8 @@ static void llm_load_vocab( + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMOLLM; + vocab.tokenizer_clean_spaces = false; } else { - throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str())); + LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__); diff --git a/llm/patches/07-embeddings.diff b/llm/patches/06-embeddings.diff similarity index 100% rename from llm/patches/07-embeddings.diff rename to llm/patches/06-embeddings.diff diff --git a/llm/patches/06-qwen2.diff b/llm/patches/06-qwen2.diff deleted file mode 100644 index 1c7109f6..00000000 --- a/llm/patches/06-qwen2.diff +++ /dev/null @@ -1,13 +0,0 @@ -diff --git a/src/llama.cpp b/src/llama.cpp -index 40d2ec2c..f34eb79a 100644 ---- a/src/llama.cpp -+++ b/src/llama.cpp -@@ -6943,7 +6943,7 @@ static struct ggml_tensor * llm_build_kqv( - struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q); - cb(kq, "kq", il); - -- if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) { -+ if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX || model.arch == LLM_ARCH_QWEN2) { - // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs - // ref: 
https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847 - ggml_mul_mat_set_prec(kq, GGML_PREC_F32); diff --git a/llm/patches/08-clip-unicode.diff b/llm/patches/07-clip-unicode.diff similarity index 100% rename from llm/patches/08-clip-unicode.diff rename to llm/patches/07-clip-unicode.diff diff --git a/llm/patches/09-pooling.diff b/llm/patches/08-pooling.diff similarity index 100% rename from llm/patches/09-pooling.diff rename to llm/patches/08-pooling.diff diff --git a/llm/patches/09-lora.diff b/llm/patches/09-lora.diff new file mode 100644 index 00000000..fc1017a6 --- /dev/null +++ b/llm/patches/09-lora.diff @@ -0,0 +1,360 @@ +diff --git a/common/common.cpp b/common/common.cpp +index dbb724fb..c26fe6ee 100644 +--- a/common/common.cpp ++++ b/common/common.cpp +@@ -2087,14 +2087,29 @@ std::tuple llama_init_from_gpt_par + for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) { + const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]); + float lora_scale = std::get<1>(params.lora_adapter[i]); ++ ++ // try to load as gguf + auto adapter = llama_lora_adapter_init(model, lora_adapter.c_str()); + if (adapter == nullptr) { +- fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__); +- llama_free(lctx); +- llama_free_model(model); +- return std::make_tuple(nullptr, nullptr); ++ fprintf(stderr, "%s: error: failed to apply lora adapter, trying ggla\n", __func__); ++ ++ // if that fails, try loading as ggla for compatibility ++ int err = llama_model_apply_lora_from_file(model, ++ lora_adapter.c_str(), ++ lora_scale, ++ ((i > 0) || params.lora_base.empty()) ++ ? NULL ++ : params.lora_base.c_str(), ++ params.n_threads); ++ if (err != 0) { ++ fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__); ++ llama_free(lctx); ++ llama_free_model(model); ++ return std::make_tuple(nullptr, nullptr); ++ } ++ } else { ++ llama_lora_adapter_set(lctx, adapter, lora_scale); + } +- llama_lora_adapter_set(lctx, adapter, lora_scale); + } + + if (params.ignore_eos) { +diff --git a/include/llama.h b/include/llama.h +index 93fd77ca..b0fb37a6 100644 +--- a/include/llama.h ++++ b/include/llama.h +@@ -1160,6 +1160,20 @@ extern "C" { + + LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx); + ++ // Apply a LoRA adapter to a loaded model ++ // path_base_model is the path to a higher quality model to use as a base for ++ // the layers modified by the adapter. Can be NULL to use the current loaded model. 
++ // The model needs to be reloaded before applying a new adapter, otherwise the adapter ++ // will be applied on top of the previous one ++ // Returns 0 on success ++ LLAMA_API int32_t llama_model_apply_lora_from_file( ++ const struct llama_model * model, ++ const char * path_lora, ++ float scale, ++ const char * path_base_model, ++ int32_t n_threads); ++ ++ + #ifdef __cplusplus + } + #endif +diff --git a/src/llama.cpp b/src/llama.cpp +index 80a0dd0f..9d7b0e17 100644 +--- a/src/llama.cpp ++++ b/src/llama.cpp +@@ -21880,3 +21880,290 @@ static void llama_log_callback_default(ggml_log_level level, const char * text, + fputs(text, stderr); + fflush(stderr); + } ++ ++static int llama_apply_lora_from_file_internal( ++ const struct llama_model & model, const char * path_lora, float scale, const char * path_base_model, int n_threads ++) { ++ LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora); ++ ++ const int64_t t_start_lora_us = ggml_time_us(); ++ ++ llama_file fin(path_lora, "rb"); ++ ++ // verify magic and version ++ { ++ uint32_t magic = fin.read_u32(); ++ if (magic != LLAMA_FILE_MAGIC_GGLA) { ++ LLAMA_LOG_ERROR("%s: bad file magic\n", __func__); ++ return 1; ++ } ++ ++ uint32_t format_version = fin.read_u32(); ++ if (format_version != 1) { ++ LLAMA_LOG_ERROR("%s: unsupported file version\n", __func__ ); ++ return 1; ++ } ++ } ++ ++ int32_t lora_r = fin.read_u32(); ++ int32_t lora_alpha = fin.read_u32(); ++ float scaling = scale * (float)lora_alpha / (float)lora_r; ++ ++ LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling); ++ ++ // load base model ++ std::unique_ptr ml; ++ if (path_base_model) { ++ LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model); ++ ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*check_tensors*/ false, /*kv_overrides*/ nullptr)); ++ ml->init_mappings(/*prefetch*/ false); // no prefetching ++ } ++ ++ struct tensor_meta { ++ std::string name; ++ ggml_type type; ++ int32_t ne[2]; ++ size_t offset; ++ }; ++ std::map tensor_meta_map; ++ ++ // load all tensor meta ++ while (true) { ++ if (fin.tell() == fin.size) { ++ // eof ++ break; ++ } ++ ++ int32_t n_dims; ++ int32_t name_len; ++ int32_t ftype; ++ ++ fin.read_raw(&n_dims, sizeof(n_dims)); ++ fin.read_raw(&name_len, sizeof(name_len)); ++ fin.read_raw(&ftype, sizeof(ftype)); ++ ++ if (n_dims != 1 && n_dims != 2) { ++ LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims); ++ return 1; ++ } ++ ++ int32_t ne[2] = { 1, 1 }; ++ for (int i = 0; i < n_dims; ++i) { ++ fin.read_raw(&ne[i], sizeof(ne[i])); ++ } ++ ++ std::string name; ++ { ++ GGML_ASSERT(name_len < GGML_MAX_NAME); ++ char buf[GGML_MAX_NAME]; ++ fin.read_raw(buf, name_len); ++ name = std::string(buf, name_len); ++ } ++ ++ // check for lora suffix ++ std::string lora_suffix; ++ if (name.length() > 6) { ++ lora_suffix = name.substr(name.length() - 6); ++ } ++ if (lora_suffix != ".loraA" && lora_suffix != ".loraB") { ++ LLAMA_LOG_ERROR("%s: error: '%s' is not a lora tensor\n", __func__, name.c_str()); ++ return 1; ++ } ++ ++ // tensor type ++ ggml_type wtype; ++ switch (ftype) { ++ case 0: wtype = GGML_TYPE_F32; break; ++ case 1: wtype = GGML_TYPE_F16; break; ++ default: ++ { ++ LLAMA_LOG_ERROR("%s: invalid tensor data type '%d'\n", ++ __func__, ftype); ++ return 1; ++ } ++ } ++ ++ // data offset ++ size_t offset = fin.tell(); ++ offset = (offset + 31) & -32; ++ ++ // skip tensor data ++ fin.seek(offset + 
ggml_row_size(wtype, ne[0]) * ne[1], SEEK_SET); ++ ++ tensor_meta_map.emplace(name, tensor_meta{ name, wtype, { ne[0], ne[1] }, offset }); ++ } ++ ++ bool warned = false; ++ int n_tensors = 0; ++ ++ // apply ++ ggml_backend_t backend_cpu = ggml_backend_cpu_init(); ++ if (backend_cpu == nullptr) { ++ LLAMA_LOG_ERROR("%s: error: failed to initialize cpu backend\n", __func__); ++ return 1; ++ } ++ ggml_backend_cpu_set_n_threads(backend_cpu, n_threads); ++ ++ std::vector> read_buf; ++ for (const auto & it : model.tensors_by_name) { ++ const std::string & base_name = it.first; ++ ggml_tensor * model_t = it.second; ++ ++ if (tensor_meta_map.find(base_name + ".loraA") == tensor_meta_map.end() || ++ tensor_meta_map.find(base_name + ".loraB") == tensor_meta_map.end()) { ++ continue; ++ } ++ ++ tensor_meta & metaA = tensor_meta_map.at(base_name + ".loraA"); ++ tensor_meta & metaB = tensor_meta_map.at(base_name + ".loraB"); ++ ++ ggml_init_params lora_init_params = { ++ /* .mem_size */ ggml_tensor_overhead()*128 + ggml_graph_overhead(), ++ /* .mem_buffer */ nullptr, ++ /* .no_alloc */ true, ++ }; ++ ggml_context * lora_ctx = ggml_init(lora_init_params); ++ if (lora_ctx == nullptr) { ++ LLAMA_LOG_ERROR("%s: error: failed to initialize lora context\n", __func__); ++ ggml_backend_free(backend_cpu); ++ return 1; ++ } ++ ++ // create tensors ++ ggml_tensor * loraA = ggml_new_tensor_2d(lora_ctx, metaA.type, metaA.ne[0], metaA.ne[1]); ++ ggml_tensor * loraB = ggml_new_tensor_2d(lora_ctx, metaB.type, metaB.ne[0], metaB.ne[1]); ++ ggml_set_name(loraA, metaA.name.c_str()); ++ ggml_set_name(loraB, metaB.name.c_str()); ++ ++ ggml_tensor * base_t; ++ if (ml) { ++ if (!ml->get_tensor_meta(base_name.c_str())) { ++ LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str()); ++ return 1; ++ } ++ base_t = ggml_dup_tensor(lora_ctx, ml->get_tensor_meta(base_name.c_str())); ++ } else { ++ base_t = ggml_dup_tensor(lora_ctx, model_t); ++ } ++ ggml_set_name(base_t, base_name.c_str()); ++ ++ // allocate in backend buffer ++ ggml_backend_buffer_t lora_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type()); ++ if (lora_buf == nullptr) { ++ LLAMA_LOG_ERROR("%s: error: failed to allocate lora tensors\n", __func__); ++ return 1; ++ } ++ ++ // load tensor data ++ auto load_tensor = [&read_buf, &fin](const tensor_meta & tensor_meta, ggml_tensor * tensor) { ++ read_buf.resize(ggml_nbytes(tensor)); ++ fin.seek(tensor_meta.offset, SEEK_SET); ++ fin.read_raw(read_buf.data(), ggml_nbytes(tensor)); ++ ggml_backend_tensor_set(tensor, read_buf.data(), 0, read_buf.size()); ++ }; ++ load_tensor(metaA, loraA); ++ load_tensor(metaB, loraB); ++ ++ // load base model tensor data ++ if (ml) { ++ ml->load_data_for(base_t); ++ } else { ++ ggml_backend_tensor_copy(model_t, base_t); ++ } ++ ++ if (ggml_is_quantized(base_t->type) && !warned) { ++ LLAMA_LOG_WARN("%s: warning: using a lora adapter with a quantized model may result in poor quality, " ++ "use a f16 or f32 base model with --lora-base\n", __func__); ++ warned = true; ++ } ++ ++ if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) { ++ LLAMA_LOG_ERROR("%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");" ++ " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]); ++ ggml_free(lora_ctx); ++ ggml_backend_buffer_free(lora_buf); ++ ggml_backend_free(backend_cpu); ++ return 1; ++ } ++ ++ auto build_lora_graph = [&]() { ++ // w = w + BA*s ++ ggml_tensor * BA = 
ggml_mul_mat(lora_ctx, loraA, loraB); ++ ggml_set_name(BA, "BA"); ++ ++ if (scaling != 1.0f) { ++ BA = ggml_scale(lora_ctx, BA, scaling); ++ ggml_set_name(BA, "BA_scaled"); ++ } ++ ++ ggml_tensor * r; ++ r = ggml_add_inplace(lora_ctx, base_t, BA); ++ ggml_set_name(r, "r_add"); ++ ++ if (base_t->type != model_t->type) { ++ // convert the result to the model type ++ r = ggml_cast(lora_ctx, r, model_t->type); ++ ggml_set_name(r, "r_cast"); ++ } ++ ++ return r; ++ }; ++ ++ ggml_cgraph * gf = ggml_new_graph(lora_ctx); ++ ggml_tensor * r = build_lora_graph(); ++ ggml_build_forward_expand(gf, r); ++ ++ ggml_backend_buffer_t graph_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type()); ++ if (graph_buf == nullptr) { ++ LLAMA_LOG_ERROR("%s: error: failed to allocate graph tensors\n", __func__); ++ ggml_free(lora_ctx); ++ ggml_backend_buffer_free(lora_buf); ++ ggml_backend_free(backend_cpu); ++ return 1; ++ } ++ ++ ggml_backend_graph_compute(backend_cpu, gf); ++ ++ ggml_backend_tensor_set(model_t, r->data, 0, ggml_nbytes(r)); ++ ++#if 0 ++ // TODO: use scheduler with fallback to CPU for less copies between CPU and GPU ++ //ggml_backend_sched_t sched = ggml_backend_sched_new(backends.data(), backends.size(), GGML_DEFAULT_GRAPH_SIZE); ++ ++ // sched compute ++ ggml_build_forward_expand(gf, build_graph()); ++ ggml_backend_sched_init_measure(sched, gf); ++ ++ // create the graph again, since the previous one was destroyed by the measure ++ ggml_graph_clear(gf); ++ ggml_build_forward_expand(gf, build_graph()); ++ ggml_backend_sched_graph_compute(sched, gf); ++ ggml_backend_sched_free(sched); ++#endif ++ ++ ggml_backend_buffer_free(lora_buf); ++ ggml_backend_buffer_free(graph_buf); ++ ggml_free(lora_ctx); ++ ++ n_tensors++; ++ if (n_tensors % 4 == 0) { ++ LLAMA_LOG_INFO("."); ++ } ++ } ++ ++ ggml_backend_free(backend_cpu); ++ ++ const int64_t t_lora_us = ggml_time_us() - t_start_lora_us; ++ LLAMA_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0); ++ ++ return 0; ++} ++ ++int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) { ++ try { ++ return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads); ++ } catch (const std::exception & err) { ++ LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what()); ++ return 1; ++ } ++} +\ No newline at end of file diff --git a/llm/patches/10-tekken.diff b/llm/patches/10-tekken.diff deleted file mode 100644 index 56a583e0..00000000 --- a/llm/patches/10-tekken.diff +++ /dev/null @@ -1,43 +0,0 @@ -diff --git a/include/llama.h b/include/llama.h -index bb4b05ba..a92174e0 100644 ---- a/include/llama.h -+++ b/include/llama.h -@@ -92,6 +92,7 @@ extern "C" { - LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17, - LLAMA_VOCAB_PRE_TYPE_VIKING = 18, - LLAMA_VOCAB_PRE_TYPE_JAIS = 19, -+ LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20, - }; - - // note: these values should be synchronized with ggml_rope -diff --git a/src/llama.cpp b/src/llama.cpp -index 18364976..435b6fe5 100644 ---- a/src/llama.cpp -+++ b/src/llama.cpp -@@ -5429,6 +5429,12 @@ static void llm_load_vocab( - } else if ( - tokenizer_pre == "jais") { - vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS; -+ } else if ( -+ tokenizer_pre == "tekken") { -+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_TEKKEN; -+ vocab.tokenizer_clean_spaces = false; -+ vocab.tokenizer_ignore_merges = true; -+ vocab.tokenizer_add_bos = true; - } else { - LLAMA_LOG_WARN("%s: missing or 
unrecognized pre-tokenizer type, using: 'default'\n", __func__); - vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; -@@ -15448,6 +15454,13 @@ struct llm_tokenizer_bpe { - " ?[^(\\s|.,!?…。,、।۔،)]+", - }; - break; -+ case LLAMA_VOCAB_PRE_TYPE_TEKKEN: -+ // original regex from tokenizer.json -+ // "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" -+ regex_exprs = { -+ "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", -+ }; -+ break; - default: - // default regex for BPE tokenization pre-processing - regex_exprs = { diff --git a/llm/patches/11-embd_kv.diff b/llm/patches/11-embd_kv.diff deleted file mode 100644 index ad17a700..00000000 --- a/llm/patches/11-embd_kv.diff +++ /dev/null @@ -1,19 +0,0 @@ -diff --git a/src/llama.cpp b/src/llama.cpp -index 2b9ace28..e60d3d8d 100644 ---- a/src/llama.cpp -+++ b/src/llama.cpp -@@ -6052,10 +6052,10 @@ static bool llm_load_tensors( - - layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); - -- layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); -- layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}); -- layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); -- layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); -+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}); -+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}); -+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}); -+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}); - - // optional bias tensors - layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); From 35b89b2eaba4ac6fc4ae1ba4bf2ec6c8bafe9529 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Wed, 3 Jul 2024 16:00:54 -0700 Subject: [PATCH 07/79] rfc: dynamic environ lookup --- app/lifecycle/logging.go | 2 +- envconfig/config.go | 28 ++++++++++++++++------------ envconfig/config_test.go | 13 ++++++------- gpu/gpu.go | 2 +- llm/memory_test.go | 4 ++-- llm/server.go | 4 ++-- server/routes.go | 2 +- 7 files changed, 29 insertions(+), 26 deletions(-) diff --git a/app/lifecycle/logging.go b/app/lifecycle/logging.go index a8f1f7cd..3672aad5 100644 --- a/app/lifecycle/logging.go +++ b/app/lifecycle/logging.go @@ -14,7 +14,7 @@ import ( func InitLogging() { level := slog.LevelInfo - if envconfig.Debug { + if envconfig.Debug() { level = slog.LevelDebug } diff --git a/envconfig/config.go b/envconfig/config.go index 0abc6968..426507be 100644 --- a/envconfig/config.go +++ b/envconfig/config.go @@ -26,11 +26,24 @@ func (o OllamaHost) String() string { var ErrInvalidHostPort = errors.New("invalid port specified in OLLAMA_HOST") +// Debug returns true if the OLLAMA_DEBUG environment variable is set to a truthy value. 
+func Debug() bool { + if s := clean("OLLAMA_DEBUG"); s != "" { + b, err := strconv.ParseBool(s) + if err != nil { + // non-empty value is truthy + return true + } + + return b + } + + return false +} + var ( // Set via OLLAMA_ORIGINS in the environment AllowOrigins []string - // Set via OLLAMA_DEBUG in the environment - Debug bool // Experimental flash attention FlashAttention bool // Set via OLLAMA_HOST in the environment @@ -80,7 +93,7 @@ type EnvVar struct { func AsMap() map[string]EnvVar { ret := map[string]EnvVar{ - "OLLAMA_DEBUG": {"OLLAMA_DEBUG", Debug, "Show additional debug information (e.g. OLLAMA_DEBUG=1)"}, + "OLLAMA_DEBUG": {"OLLAMA_DEBUG", Debug(), "Show additional debug information (e.g. OLLAMA_DEBUG=1)"}, "OLLAMA_FLASH_ATTENTION": {"OLLAMA_FLASH_ATTENTION", FlashAttention, "Enabled flash attention"}, "OLLAMA_HOST": {"OLLAMA_HOST", Host, "IP Address for the ollama server (default 127.0.0.1:11434)"}, "OLLAMA_KEEP_ALIVE": {"OLLAMA_KEEP_ALIVE", KeepAlive, "The duration that models stay loaded in memory (default \"5m\")"}, @@ -137,15 +150,6 @@ func init() { } func LoadConfig() { - if debug := clean("OLLAMA_DEBUG"); debug != "" { - d, err := strconv.ParseBool(debug) - if err == nil { - Debug = d - } else { - Debug = true - } - } - if fa := clean("OLLAMA_FLASH_ATTENTION"); fa != "" { d, err := strconv.ParseBool(fa) if err == nil { diff --git a/envconfig/config_test.go b/envconfig/config_test.go index a5d73fd7..f083bb03 100644 --- a/envconfig/config_test.go +++ b/envconfig/config_test.go @@ -12,16 +12,15 @@ import ( ) func TestConfig(t *testing.T) { - Debug = false // Reset whatever was loaded in init() t.Setenv("OLLAMA_DEBUG", "") - LoadConfig() - require.False(t, Debug) + require.False(t, Debug()) + t.Setenv("OLLAMA_DEBUG", "false") - LoadConfig() - require.False(t, Debug) + require.False(t, Debug()) + t.Setenv("OLLAMA_DEBUG", "1") - LoadConfig() - require.True(t, Debug) + require.True(t, Debug()) + t.Setenv("OLLAMA_FLASH_ATTENTION", "1") LoadConfig() require.True(t, FlashAttention) diff --git a/gpu/gpu.go b/gpu/gpu.go index 6e25cb46..1815668f 100644 --- a/gpu/gpu.go +++ b/gpu/gpu.go @@ -611,7 +611,7 @@ func LoadOneapiMgmt(oneapiLibPaths []string) (int, *C.oneapi_handle_t, string) { } func getVerboseState() C.uint16_t { - if envconfig.Debug { + if envconfig.Debug() { return C.uint16_t(1) } return C.uint16_t(0) diff --git a/llm/memory_test.go b/llm/memory_test.go index f972f927..06ae7438 100644 --- a/llm/memory_test.go +++ b/llm/memory_test.go @@ -8,14 +8,14 @@ import ( "testing" "github.com/ollama/ollama/api" - "github.com/ollama/ollama/envconfig" "github.com/ollama/ollama/gpu" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) func TestEstimateGPULayers(t *testing.T) { - envconfig.Debug = true + t.Setenv("OLLAMA_DEBUG", "1") + modelName := "dummy" f, err := os.CreateTemp(t.TempDir(), modelName) require.NoError(t, err) diff --git a/llm/server.go b/llm/server.go index 08463ef0..eb966650 100644 --- a/llm/server.go +++ b/llm/server.go @@ -195,7 +195,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr params = append(params, "--n-gpu-layers", fmt.Sprintf("%d", opts.NumGPU)) } - if envconfig.Debug { + if envconfig.Debug() { params = append(params, "--verbose") } @@ -381,7 +381,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr } slog.Info("starting llama server", "cmd", s.cmd.String()) - if envconfig.Debug { + if envconfig.Debug() { filteredEnv := []string{} for _, ev := range s.cmd.Env { 
if strings.HasPrefix(ev, "CUDA_") || diff --git a/server/routes.go b/server/routes.go index 0d7ca003..c049421b 100644 --- a/server/routes.go +++ b/server/routes.go @@ -1093,7 +1093,7 @@ func (s *Server) GenerateRoutes() http.Handler { func Serve(ln net.Listener) error { level := slog.LevelInfo - if envconfig.Debug { + if envconfig.Debug() { level = slog.LevelDebug } From 4f1afd575d1dfd803b0d9abb995862d61e8d0734 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Wed, 3 Jul 2024 16:44:57 -0700 Subject: [PATCH 08/79] host --- api/client.go | 8 +-- cmd/cmd.go | 2 +- envconfig/config.go | 107 ++++++++++++++++----------------------- envconfig/config_test.go | 62 +++++++++-------------- 4 files changed, 71 insertions(+), 108 deletions(-) diff --git a/api/client.go b/api/client.go index c59fbc42..e02b21bf 100644 --- a/api/client.go +++ b/api/client.go @@ -20,7 +20,6 @@ import ( "encoding/json" "fmt" "io" - "net" "net/http" "net/url" "runtime" @@ -63,13 +62,8 @@ func checkError(resp *http.Response, body []byte) error { // If the variable is not specified, a default ollama host and port will be // used. func ClientFromEnvironment() (*Client, error) { - ollamaHost := envconfig.Host - return &Client{ - base: &url.URL{ - Scheme: ollamaHost.Scheme, - Host: net.JoinHostPort(ollamaHost.Host, ollamaHost.Port), - }, + base: envconfig.Host(), http: http.DefaultClient, }, nil } diff --git a/cmd/cmd.go b/cmd/cmd.go index b761d018..5f3735f4 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -1076,7 +1076,7 @@ func RunServer(cmd *cobra.Command, _ []string) error { return err } - ln, err := net.Listen("tcp", net.JoinHostPort(envconfig.Host.Host, envconfig.Host.Port)) + ln, err := net.Listen("tcp", envconfig.Host().Host) if err != nil { return err } diff --git a/envconfig/config.go b/envconfig/config.go index 426507be..23f93270 100644 --- a/envconfig/config.go +++ b/envconfig/config.go @@ -6,6 +6,7 @@ import ( "log/slog" "math" "net" + "net/url" "os" "path/filepath" "runtime" @@ -14,16 +15,6 @@ import ( "time" ) -type OllamaHost struct { - Scheme string - Host string - Port string -} - -func (o OllamaHost) String() string { - return fmt.Sprintf("%s://%s:%s", o.Scheme, o.Host, o.Port) -} - var ErrInvalidHostPort = errors.New("invalid port specified in OLLAMA_HOST") // Debug returns true if the OLLAMA_DEBUG environment variable is set to a truthy value. @@ -41,13 +32,54 @@ func Debug() bool { return false } +// Host returns the scheme and host. Host can be configured via the OLLAMA_HOST environment variable. 
+// Default is scheme "http" and host "127.0.0.1:11434" +func Host() *url.URL { + defaultPort := "11434" + + s := os.Getenv("OLLAMA_HOST") + s = strings.TrimSpace(strings.Trim(strings.TrimSpace(s), "\"'")) + scheme, hostport, ok := strings.Cut(s, "://") + switch { + case !ok: + scheme, hostport = "http", s + case scheme == "http": + defaultPort = "80" + case scheme == "https": + defaultPort = "443" + } + + // trim trailing slashes + hostport = strings.TrimRight(hostport, "/") + + host, port, err := net.SplitHostPort(hostport) + if err != nil { + host, port = "127.0.0.1", defaultPort + if ip := net.ParseIP(strings.Trim(hostport, "[]")); ip != nil { + host = ip.String() + } else if hostport != "" { + host = hostport + } + } + + if n, err := strconv.ParseInt(port, 10, 32); err != nil || n > 65535 || n < 0 { + return &url.URL{ + Scheme: scheme, + Host: net.JoinHostPort(host, defaultPort), + } + } + + return &url.URL{ + Scheme: scheme, + Host: net.JoinHostPort(host, port), + } +} + var ( // Set via OLLAMA_ORIGINS in the environment AllowOrigins []string // Experimental flash attention FlashAttention bool - // Set via OLLAMA_HOST in the environment - Host *OllamaHost // Set via OLLAMA_KEEP_ALIVE in the environment KeepAlive time.Duration // Set via OLLAMA_LLM_LIBRARY in the environment @@ -95,7 +127,7 @@ func AsMap() map[string]EnvVar { ret := map[string]EnvVar{ "OLLAMA_DEBUG": {"OLLAMA_DEBUG", Debug(), "Show additional debug information (e.g. OLLAMA_DEBUG=1)"}, "OLLAMA_FLASH_ATTENTION": {"OLLAMA_FLASH_ATTENTION", FlashAttention, "Enabled flash attention"}, - "OLLAMA_HOST": {"OLLAMA_HOST", Host, "IP Address for the ollama server (default 127.0.0.1:11434)"}, + "OLLAMA_HOST": {"OLLAMA_HOST", Host(), "IP Address for the ollama server (default 127.0.0.1:11434)"}, "OLLAMA_KEEP_ALIVE": {"OLLAMA_KEEP_ALIVE", KeepAlive, "The duration that models stay loaded in memory (default \"5m\")"}, "OLLAMA_LLM_LIBRARY": {"OLLAMA_LLM_LIBRARY", LLMLibrary, "Set LLM library to bypass autodetection"}, "OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners, "Maximum number of loaded models per GPU"}, @@ -271,11 +303,6 @@ func LoadConfig() { slog.Error("invalid setting", "OLLAMA_MODELS", ModelsDir, "error", err) } - Host, err = getOllamaHost() - if err != nil { - slog.Error("invalid setting", "OLLAMA_HOST", Host, "error", err, "using default port", Host.Port) - } - if set, err := strconv.ParseBool(clean("OLLAMA_INTEL_GPU")); err == nil { IntelGpu = set } @@ -298,50 +325,6 @@ func getModelsDir() (string, error) { return filepath.Join(home, ".ollama", "models"), nil } -func getOllamaHost() (*OllamaHost, error) { - defaultPort := "11434" - - hostVar := os.Getenv("OLLAMA_HOST") - hostVar = strings.TrimSpace(strings.Trim(strings.TrimSpace(hostVar), "\"'")) - - scheme, hostport, ok := strings.Cut(hostVar, "://") - switch { - case !ok: - scheme, hostport = "http", hostVar - case scheme == "http": - defaultPort = "80" - case scheme == "https": - defaultPort = "443" - } - - // trim trailing slashes - hostport = strings.TrimRight(hostport, "/") - - host, port, err := net.SplitHostPort(hostport) - if err != nil { - host, port = "127.0.0.1", defaultPort - if ip := net.ParseIP(strings.Trim(hostport, "[]")); ip != nil { - host = ip.String() - } else if hostport != "" { - host = hostport - } - } - - if portNum, err := strconv.ParseInt(port, 10, 32); err != nil || portNum > 65535 || portNum < 0 { - return &OllamaHost{ - Scheme: scheme, - Host: host, - Port: defaultPort, - }, ErrInvalidHostPort - } - - return &OllamaHost{ - 
Scheme: scheme, - Host: host, - Port: port, - }, nil -} - func loadKeepAlive(ka string) { v, err := strconv.Atoi(ka) if err != nil { diff --git a/envconfig/config_test.go b/envconfig/config_test.go index f083bb03..af89e7b7 100644 --- a/envconfig/config_test.go +++ b/envconfig/config_test.go @@ -1,13 +1,10 @@ package envconfig import ( - "fmt" "math" - "net" "testing" "time" - "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) @@ -42,45 +39,34 @@ func TestConfig(t *testing.T) { } func TestClientFromEnvironment(t *testing.T) { - type testCase struct { + cases := map[string]struct { value string expect string - err error + }{ + "empty": {"", "127.0.0.1:11434"}, + "only address": {"1.2.3.4", "1.2.3.4:11434"}, + "only port": {":1234", ":1234"}, + "address and port": {"1.2.3.4:1234", "1.2.3.4:1234"}, + "hostname": {"example.com", "example.com:11434"}, + "hostname and port": {"example.com:1234", "example.com:1234"}, + "zero port": {":0", ":0"}, + "too large port": {":66000", ":11434"}, + "too small port": {":-1", ":11434"}, + "ipv6 localhost": {"[::1]", "[::1]:11434"}, + "ipv6 world open": {"[::]", "[::]:11434"}, + "ipv6 no brackets": {"::1", "[::1]:11434"}, + "ipv6 + port": {"[::1]:1337", "[::1]:1337"}, + "extra space": {" 1.2.3.4 ", "1.2.3.4:11434"}, + "extra quotes": {"\"1.2.3.4\"", "1.2.3.4:11434"}, + "extra space+quotes": {" \" 1.2.3.4 \" ", "1.2.3.4:11434"}, + "extra single quotes": {"'1.2.3.4'", "1.2.3.4:11434"}, } - hostTestCases := map[string]*testCase{ - "empty": {value: "", expect: "127.0.0.1:11434"}, - "only address": {value: "1.2.3.4", expect: "1.2.3.4:11434"}, - "only port": {value: ":1234", expect: ":1234"}, - "address and port": {value: "1.2.3.4:1234", expect: "1.2.3.4:1234"}, - "hostname": {value: "example.com", expect: "example.com:11434"}, - "hostname and port": {value: "example.com:1234", expect: "example.com:1234"}, - "zero port": {value: ":0", expect: ":0"}, - "too large port": {value: ":66000", err: ErrInvalidHostPort}, - "too small port": {value: ":-1", err: ErrInvalidHostPort}, - "ipv6 localhost": {value: "[::1]", expect: "[::1]:11434"}, - "ipv6 world open": {value: "[::]", expect: "[::]:11434"}, - "ipv6 no brackets": {value: "::1", expect: "[::1]:11434"}, - "ipv6 + port": {value: "[::1]:1337", expect: "[::1]:1337"}, - "extra space": {value: " 1.2.3.4 ", expect: "1.2.3.4:11434"}, - "extra quotes": {value: "\"1.2.3.4\"", expect: "1.2.3.4:11434"}, - "extra space+quotes": {value: " \" 1.2.3.4 \" ", expect: "1.2.3.4:11434"}, - "extra single quotes": {value: "'1.2.3.4'", expect: "1.2.3.4:11434"}, - } - - for k, v := range hostTestCases { - t.Run(k, func(t *testing.T) { - t.Setenv("OLLAMA_HOST", v.value) - LoadConfig() - - oh, err := getOllamaHost() - if err != v.err { - t.Fatalf("expected %s, got %s", v.err, err) - } - - if err == nil { - host := net.JoinHostPort(oh.Host, oh.Port) - assert.Equal(t, v.expect, host, fmt.Sprintf("%s: expected %s, got %s", k, v.expect, host)) + for name, tt := range cases { + t.Run(name, func(t *testing.T) { + t.Setenv("OLLAMA_HOST", tt.value) + if host := Host(); host.Host != tt.expect { + t.Errorf("%s: expected %s, got %s", name, tt.expect, host.Host) } }) } From d1a5227cadf6ae736f8dd5cb9fb7452dd015f820 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Wed, 3 Jul 2024 17:02:07 -0700 Subject: [PATCH 09/79] origins --- envconfig/config.go | 52 +++++++++++----------- envconfig/config_test.go | 95 +++++++++++++++++++++++++++++++++++++++- server/routes.go | 2 +- 3 files changed, 119 insertions(+), 30 deletions(-) diff --git 
a/envconfig/config.go b/envconfig/config.go index 23f93270..7ae521ab 100644 --- a/envconfig/config.go +++ b/envconfig/config.go @@ -75,9 +75,31 @@ func Host() *url.URL { } } +// Origins returns a list of allowed origins. Origins can be configured via the OLLAMA_ORIGINS environment variable. +func Origins() (origins []string) { + if s := clean("OLLAMA_ORIGINS"); s != "" { + origins = strings.Split(s, ",") + } + + for _, origin := range []string{"localhost", "127.0.0.1", "0.0.0.0"} { + origins = append(origins, + fmt.Sprintf("http://%s", origin), + fmt.Sprintf("https://%s", origin), + fmt.Sprintf("http://%s", net.JoinHostPort(origin, "*")), + fmt.Sprintf("https://%s", net.JoinHostPort(origin, "*")), + ) + } + + origins = append(origins, + "app://*", + "file://*", + "tauri://*", + ) + + return origins +} + var ( - // Set via OLLAMA_ORIGINS in the environment - AllowOrigins []string // Experimental flash attention FlashAttention bool // Set via OLLAMA_KEEP_ALIVE in the environment @@ -136,7 +158,7 @@ func AsMap() map[string]EnvVar { "OLLAMA_NOHISTORY": {"OLLAMA_NOHISTORY", NoHistory, "Do not preserve readline history"}, "OLLAMA_NOPRUNE": {"OLLAMA_NOPRUNE", NoPrune, "Do not prune model blobs on startup"}, "OLLAMA_NUM_PARALLEL": {"OLLAMA_NUM_PARALLEL", NumParallel, "Maximum number of parallel requests"}, - "OLLAMA_ORIGINS": {"OLLAMA_ORIGINS", AllowOrigins, "A comma separated list of allowed origins"}, + "OLLAMA_ORIGINS": {"OLLAMA_ORIGINS", Origins(), "A comma separated list of allowed origins"}, "OLLAMA_RUNNERS_DIR": {"OLLAMA_RUNNERS_DIR", RunnersDir, "Location for runners"}, "OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread, "Always schedule model across all GPUs"}, "OLLAMA_TMPDIR": {"OLLAMA_TMPDIR", TmpDir, "Location for temporary files"}, @@ -160,12 +182,6 @@ func Values() map[string]string { return vals } -var defaultAllowOrigins = []string{ - "localhost", - "127.0.0.1", - "0.0.0.0", -} - // Clean quotes and spaces from the value func clean(key string) string { return strings.Trim(os.Getenv(key), "\"' ") @@ -255,24 +271,6 @@ func LoadConfig() { NoPrune = true } - if origins := clean("OLLAMA_ORIGINS"); origins != "" { - AllowOrigins = strings.Split(origins, ",") - } - for _, allowOrigin := range defaultAllowOrigins { - AllowOrigins = append(AllowOrigins, - fmt.Sprintf("http://%s", allowOrigin), - fmt.Sprintf("https://%s", allowOrigin), - fmt.Sprintf("http://%s", net.JoinHostPort(allowOrigin, "*")), - fmt.Sprintf("https://%s", net.JoinHostPort(allowOrigin, "*")), - ) - } - - AllowOrigins = append(AllowOrigins, - "app://*", - "file://*", - "tauri://*", - ) - maxRunners := clean("OLLAMA_MAX_LOADED_MODELS") if maxRunners != "" { m, err := strconv.Atoi(maxRunners) diff --git a/envconfig/config_test.go b/envconfig/config_test.go index af89e7b7..dc65ef70 100644 --- a/envconfig/config_test.go +++ b/envconfig/config_test.go @@ -5,10 +5,11 @@ import ( "testing" "time" + "github.com/google/go-cmp/cmp" "github.com/stretchr/testify/require" ) -func TestConfig(t *testing.T) { +func TestSmoke(t *testing.T) { t.Setenv("OLLAMA_DEBUG", "") require.False(t, Debug()) @@ -38,7 +39,7 @@ func TestConfig(t *testing.T) { require.Equal(t, time.Duration(math.MaxInt64), KeepAlive) } -func TestClientFromEnvironment(t *testing.T) { +func TestHost(t *testing.T) { cases := map[string]struct { value string expect string @@ -71,3 +72,93 @@ func TestClientFromEnvironment(t *testing.T) { }) } } + +func TestOrigins(t *testing.T) { + cases := []struct { + value string + expect []string + }{ + {"", []string{ + 
"http://localhost", + "https://localhost", + "http://localhost:*", + "https://localhost:*", + "http://127.0.0.1", + "https://127.0.0.1", + "http://127.0.0.1:*", + "https://127.0.0.1:*", + "http://0.0.0.0", + "https://0.0.0.0", + "http://0.0.0.0:*", + "https://0.0.0.0:*", + "app://*", + "file://*", + "tauri://*", + }}, + {"http://10.0.0.1", []string{ + "http://10.0.0.1", + "http://localhost", + "https://localhost", + "http://localhost:*", + "https://localhost:*", + "http://127.0.0.1", + "https://127.0.0.1", + "http://127.0.0.1:*", + "https://127.0.0.1:*", + "http://0.0.0.0", + "https://0.0.0.0", + "http://0.0.0.0:*", + "https://0.0.0.0:*", + "app://*", + "file://*", + "tauri://*", + }}, + {"http://172.16.0.1,https://192.168.0.1", []string{ + "http://172.16.0.1", + "https://192.168.0.1", + "http://localhost", + "https://localhost", + "http://localhost:*", + "https://localhost:*", + "http://127.0.0.1", + "https://127.0.0.1", + "http://127.0.0.1:*", + "https://127.0.0.1:*", + "http://0.0.0.0", + "https://0.0.0.0", + "http://0.0.0.0:*", + "https://0.0.0.0:*", + "app://*", + "file://*", + "tauri://*", + }}, + {"http://totally.safe,http://definitely.legit", []string{ + "http://totally.safe", + "http://definitely.legit", + "http://localhost", + "https://localhost", + "http://localhost:*", + "https://localhost:*", + "http://127.0.0.1", + "https://127.0.0.1", + "http://127.0.0.1:*", + "https://127.0.0.1:*", + "http://0.0.0.0", + "https://0.0.0.0", + "http://0.0.0.0:*", + "https://0.0.0.0:*", + "app://*", + "file://*", + "tauri://*", + }}, + } + for _, tt := range cases { + t.Run(tt.value, func(t *testing.T) { + t.Setenv("OLLAMA_ORIGINS", tt.value) + + if diff := cmp.Diff(Origins(), tt.expect); diff != "" { + t.Errorf("%s: mismatch (-want +got):\n%s", tt.value, diff) + } + }) + } +} diff --git a/server/routes.go b/server/routes.go index c049421b..07898d9b 100644 --- a/server/routes.go +++ b/server/routes.go @@ -1048,7 +1048,7 @@ func (s *Server) GenerateRoutes() http.Handler { for _, prop := range openAIProperties { config.AllowHeaders = append(config.AllowHeaders, "x-stainless-"+prop) } - config.AllowOrigins = envconfig.AllowOrigins + config.AllowOrigins = envconfig.Origins() r := gin.Default() r.Use( From 66fe77f0841622054e29f5fd3d3643f514991004 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Wed, 3 Jul 2024 17:07:42 -0700 Subject: [PATCH 10/79] models --- envconfig/config.go | 34 ++++++++++++++++------------------ server/modelpath.go | 12 +++--------- 2 files changed, 19 insertions(+), 27 deletions(-) diff --git a/envconfig/config.go b/envconfig/config.go index 7ae521ab..286f51d4 100644 --- a/envconfig/config.go +++ b/envconfig/config.go @@ -99,6 +99,21 @@ func Origins() (origins []string) { return origins } +// Models returns the path to the models directory. Models directory can be configured via the OLLAMA_MODELS environment variable. 
+// Default is $HOME/.ollama/models +func Models() string { + if s, ok := os.LookupEnv("OLLAMA_MODELS"); ok { + return s + } + + home, err := os.UserHomeDir() + if err != nil { + panic(err) + } + + return filepath.Join(home, ".ollama", "models") +} + var ( // Experimental flash attention FlashAttention bool @@ -154,7 +169,7 @@ func AsMap() map[string]EnvVar { "OLLAMA_LLM_LIBRARY": {"OLLAMA_LLM_LIBRARY", LLMLibrary, "Set LLM library to bypass autodetection"}, "OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners, "Maximum number of loaded models per GPU"}, "OLLAMA_MAX_QUEUE": {"OLLAMA_MAX_QUEUE", MaxQueuedRequests, "Maximum number of queued requests"}, - "OLLAMA_MODELS": {"OLLAMA_MODELS", ModelsDir, "The path to the models directory"}, + "OLLAMA_MODELS": {"OLLAMA_MODELS", Models(), "The path to the models directory"}, "OLLAMA_NOHISTORY": {"OLLAMA_NOHISTORY", NoHistory, "Do not preserve readline history"}, "OLLAMA_NOPRUNE": {"OLLAMA_NOPRUNE", NoPrune, "Do not prune model blobs on startup"}, "OLLAMA_NUM_PARALLEL": {"OLLAMA_NUM_PARALLEL", NumParallel, "Maximum number of parallel requests"}, @@ -295,12 +310,6 @@ func LoadConfig() { loadKeepAlive(ka) } - var err error - ModelsDir, err = getModelsDir() - if err != nil { - slog.Error("invalid setting", "OLLAMA_MODELS", ModelsDir, "error", err) - } - if set, err := strconv.ParseBool(clean("OLLAMA_INTEL_GPU")); err == nil { IntelGpu = set } @@ -312,17 +321,6 @@ func LoadConfig() { HsaOverrideGfxVersion = clean("HSA_OVERRIDE_GFX_VERSION") } -func getModelsDir() (string, error) { - if models, exists := os.LookupEnv("OLLAMA_MODELS"); exists { - return models, nil - } - home, err := os.UserHomeDir() - if err != nil { - return "", err - } - return filepath.Join(home, ".ollama", "models"), nil -} - func loadKeepAlive(ka string) { v, err := strconv.Atoi(ka) if err != nil { diff --git a/server/modelpath.go b/server/modelpath.go index 3fdb4238..354eeed7 100644 --- a/server/modelpath.go +++ b/server/modelpath.go @@ -105,9 +105,7 @@ func (mp ModelPath) GetShortTagname() string { // GetManifestPath returns the path to the manifest file for the given model path, it is up to the caller to create the directory if it does not exist. 
func (mp ModelPath) GetManifestPath() (string, error) { - dir := envconfig.ModelsDir - - return filepath.Join(dir, "manifests", mp.Registry, mp.Namespace, mp.Repository, mp.Tag), nil + return filepath.Join(envconfig.Models(), "manifests", mp.Registry, mp.Namespace, mp.Repository, mp.Tag), nil } func (mp ModelPath) BaseURL() *url.URL { @@ -118,9 +116,7 @@ func (mp ModelPath) BaseURL() *url.URL { } func GetManifestPath() (string, error) { - dir := envconfig.ModelsDir - - path := filepath.Join(dir, "manifests") + path := filepath.Join(envconfig.Models(), "manifests") if err := os.MkdirAll(path, 0o755); err != nil { return "", err } @@ -129,8 +125,6 @@ func GetManifestPath() (string, error) { } func GetBlobsPath(digest string) (string, error) { - dir := envconfig.ModelsDir - // only accept actual sha256 digests pattern := "^sha256[:-][0-9a-fA-F]{64}$" re := regexp.MustCompile(pattern) @@ -140,7 +134,7 @@ func GetBlobsPath(digest string) (string, error) { } digest = strings.ReplaceAll(digest, ":", "-") - path := filepath.Join(dir, "blobs", digest) + path := filepath.Join(envconfig.Models(), "blobs", digest) dirPath := filepath.Dir(path) if digest == "" { dirPath = path From 55cd3ddccac14d48f5f129ec35b3a109be215d01 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Wed, 3 Jul 2024 17:22:13 -0700 Subject: [PATCH 11/79] bool --- cmd/interactive.go | 2 +- envconfig/config.go | 123 ++++++++++++++++----------------------- envconfig/config_test.go | 28 ++++++++- gpu/gpu.go | 2 +- llm/server.go | 2 +- server/images.go | 4 +- server/routes.go | 2 +- server/sched.go | 2 +- 8 files changed, 82 insertions(+), 83 deletions(-) diff --git a/cmd/interactive.go b/cmd/interactive.go index adbc3e9f..9fb66851 100644 --- a/cmd/interactive.go +++ b/cmd/interactive.go @@ -157,7 +157,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error { return err } - if envconfig.NoHistory { + if envconfig.NoHistory() { scanner.HistoryDisable() } diff --git a/envconfig/config.go b/envconfig/config.go index 286f51d4..ea78585b 100644 --- a/envconfig/config.go +++ b/envconfig/config.go @@ -17,21 +17,6 @@ import ( var ErrInvalidHostPort = errors.New("invalid port specified in OLLAMA_HOST") -// Debug returns true if the OLLAMA_DEBUG environment variable is set to a truthy value. -func Debug() bool { - if s := clean("OLLAMA_DEBUG"); s != "" { - b, err := strconv.ParseBool(s) - if err != nil { - // non-empty value is truthy - return true - } - - return b - } - - return false -} - // Host returns the scheme and host. Host can be configured via the OLLAMA_HOST environment variable. // Default is scheme "http" and host "127.0.0.1:11434" func Host() *url.URL { @@ -77,7 +62,7 @@ func Host() *url.URL { // Origins returns a list of allowed origins. Origins can be configured via the OLLAMA_ORIGINS environment variable. func Origins() (origins []string) { - if s := clean("OLLAMA_ORIGINS"); s != "" { + if s := getenv("OLLAMA_ORIGINS"); s != "" { origins = strings.Split(s, ",") } @@ -114,9 +99,37 @@ func Models() string { return filepath.Join(home, ".ollama", "models") } +func Bool(k string) func() bool { + return func() bool { + if s := getenv(k); s != "" { + b, err := strconv.ParseBool(s) + if err != nil { + return true + } + + return b + } + + return false + } +} + +var ( + // Debug enabled additional debug information. + Debug = Bool("OLLAMA_DEBUG") + // FlashAttention enables the experimental flash attention feature. + FlashAttention = Bool("OLLAMA_FLASH_ATTENTION") + // NoHistory disables readline history. 
+ NoHistory = Bool("OLLAMA_NOHISTORY") + // NoPrune disables pruning of model blobs on startup. + NoPrune = Bool("OLLAMA_NOPRUNE") + // SchedSpread allows scheduling models across all GPUs. + SchedSpread = Bool("OLLAMA_SCHED_SPREAD") + // IntelGPU enables experimental Intel GPU detection. + IntelGPU = Bool("OLLAMA_INTEL_GPU") +) + var ( - // Experimental flash attention - FlashAttention bool // Set via OLLAMA_KEEP_ALIVE in the environment KeepAlive time.Duration // Set via OLLAMA_LLM_LIBRARY in the environment @@ -125,22 +138,12 @@ var ( MaxRunners int // Set via OLLAMA_MAX_QUEUE in the environment MaxQueuedRequests int - // Set via OLLAMA_MODELS in the environment - ModelsDir string - // Set via OLLAMA_NOHISTORY in the environment - NoHistory bool - // Set via OLLAMA_NOPRUNE in the environment - NoPrune bool // Set via OLLAMA_NUM_PARALLEL in the environment NumParallel int // Set via OLLAMA_RUNNERS_DIR in the environment RunnersDir string - // Set via OLLAMA_SCHED_SPREAD in the environment - SchedSpread bool // Set via OLLAMA_TMPDIR in the environment TmpDir string - // Set via OLLAMA_INTEL_GPU in the environment - IntelGpu bool // Set via CUDA_VISIBLE_DEVICES in the environment CudaVisibleDevices string @@ -163,19 +166,19 @@ type EnvVar struct { func AsMap() map[string]EnvVar { ret := map[string]EnvVar{ "OLLAMA_DEBUG": {"OLLAMA_DEBUG", Debug(), "Show additional debug information (e.g. OLLAMA_DEBUG=1)"}, - "OLLAMA_FLASH_ATTENTION": {"OLLAMA_FLASH_ATTENTION", FlashAttention, "Enabled flash attention"}, + "OLLAMA_FLASH_ATTENTION": {"OLLAMA_FLASH_ATTENTION", FlashAttention(), "Enabled flash attention"}, "OLLAMA_HOST": {"OLLAMA_HOST", Host(), "IP Address for the ollama server (default 127.0.0.1:11434)"}, "OLLAMA_KEEP_ALIVE": {"OLLAMA_KEEP_ALIVE", KeepAlive, "The duration that models stay loaded in memory (default \"5m\")"}, "OLLAMA_LLM_LIBRARY": {"OLLAMA_LLM_LIBRARY", LLMLibrary, "Set LLM library to bypass autodetection"}, "OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners, "Maximum number of loaded models per GPU"}, "OLLAMA_MAX_QUEUE": {"OLLAMA_MAX_QUEUE", MaxQueuedRequests, "Maximum number of queued requests"}, "OLLAMA_MODELS": {"OLLAMA_MODELS", Models(), "The path to the models directory"}, - "OLLAMA_NOHISTORY": {"OLLAMA_NOHISTORY", NoHistory, "Do not preserve readline history"}, - "OLLAMA_NOPRUNE": {"OLLAMA_NOPRUNE", NoPrune, "Do not prune model blobs on startup"}, + "OLLAMA_NOHISTORY": {"OLLAMA_NOHISTORY", NoHistory(), "Do not preserve readline history"}, + "OLLAMA_NOPRUNE": {"OLLAMA_NOPRUNE", NoPrune(), "Do not prune model blobs on startup"}, "OLLAMA_NUM_PARALLEL": {"OLLAMA_NUM_PARALLEL", NumParallel, "Maximum number of parallel requests"}, "OLLAMA_ORIGINS": {"OLLAMA_ORIGINS", Origins(), "A comma separated list of allowed origins"}, "OLLAMA_RUNNERS_DIR": {"OLLAMA_RUNNERS_DIR", RunnersDir, "Location for runners"}, - "OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread, "Always schedule model across all GPUs"}, + "OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"}, "OLLAMA_TMPDIR": {"OLLAMA_TMPDIR", TmpDir, "Location for temporary files"}, } if runtime.GOOS != "darwin" { @@ -184,7 +187,7 @@ func AsMap() map[string]EnvVar { ret["ROCR_VISIBLE_DEVICES"] = EnvVar{"ROCR_VISIBLE_DEVICES", RocrVisibleDevices, "Set which AMD devices are visible"} ret["GPU_DEVICE_ORDINAL"] = EnvVar{"GPU_DEVICE_ORDINAL", GpuDeviceOrdinal, "Set which AMD devices are visible"} ret["HSA_OVERRIDE_GFX_VERSION"] = EnvVar{"HSA_OVERRIDE_GFX_VERSION", 
HsaOverrideGfxVersion, "Override the gfx used for all detected AMD GPUs"} - ret["OLLAMA_INTEL_GPU"] = EnvVar{"OLLAMA_INTEL_GPU", IntelGpu, "Enable experimental Intel GPU detection"} + ret["OLLAMA_INTEL_GPU"] = EnvVar{"OLLAMA_INTEL_GPU", IntelGPU(), "Enable experimental Intel GPU detection"} } return ret } @@ -197,8 +200,8 @@ func Values() map[string]string { return vals } -// Clean quotes and spaces from the value -func clean(key string) string { +// getenv returns an environment variable stripped of leading and trailing quotes or spaces +func getenv(key string) string { return strings.Trim(os.Getenv(key), "\"' ") } @@ -213,14 +216,7 @@ func init() { } func LoadConfig() { - if fa := clean("OLLAMA_FLASH_ATTENTION"); fa != "" { - d, err := strconv.ParseBool(fa) - if err == nil { - FlashAttention = d - } - } - - RunnersDir = clean("OLLAMA_RUNNERS_DIR") + RunnersDir = getenv("OLLAMA_RUNNERS_DIR") if runtime.GOOS == "windows" && RunnersDir == "" { // On Windows we do not carry the payloads inside the main executable appExe, err := os.Executable() @@ -256,11 +252,11 @@ func LoadConfig() { } } - TmpDir = clean("OLLAMA_TMPDIR") + TmpDir = getenv("OLLAMA_TMPDIR") - LLMLibrary = clean("OLLAMA_LLM_LIBRARY") + LLMLibrary = getenv("OLLAMA_LLM_LIBRARY") - if onp := clean("OLLAMA_NUM_PARALLEL"); onp != "" { + if onp := getenv("OLLAMA_NUM_PARALLEL"); onp != "" { val, err := strconv.Atoi(onp) if err != nil { slog.Error("invalid setting, ignoring", "OLLAMA_NUM_PARALLEL", onp, "error", err) @@ -269,24 +265,7 @@ func LoadConfig() { } } - if nohistory := clean("OLLAMA_NOHISTORY"); nohistory != "" { - NoHistory = true - } - - if spread := clean("OLLAMA_SCHED_SPREAD"); spread != "" { - s, err := strconv.ParseBool(spread) - if err == nil { - SchedSpread = s - } else { - SchedSpread = true - } - } - - if noprune := clean("OLLAMA_NOPRUNE"); noprune != "" { - NoPrune = true - } - - maxRunners := clean("OLLAMA_MAX_LOADED_MODELS") + maxRunners := getenv("OLLAMA_MAX_LOADED_MODELS") if maxRunners != "" { m, err := strconv.Atoi(maxRunners) if err != nil { @@ -305,20 +284,16 @@ func LoadConfig() { } } - ka := clean("OLLAMA_KEEP_ALIVE") + ka := getenv("OLLAMA_KEEP_ALIVE") if ka != "" { loadKeepAlive(ka) } - if set, err := strconv.ParseBool(clean("OLLAMA_INTEL_GPU")); err == nil { - IntelGpu = set - } - - CudaVisibleDevices = clean("CUDA_VISIBLE_DEVICES") - HipVisibleDevices = clean("HIP_VISIBLE_DEVICES") - RocrVisibleDevices = clean("ROCR_VISIBLE_DEVICES") - GpuDeviceOrdinal = clean("GPU_DEVICE_ORDINAL") - HsaOverrideGfxVersion = clean("HSA_OVERRIDE_GFX_VERSION") + CudaVisibleDevices = getenv("CUDA_VISIBLE_DEVICES") + HipVisibleDevices = getenv("HIP_VISIBLE_DEVICES") + RocrVisibleDevices = getenv("ROCR_VISIBLE_DEVICES") + GpuDeviceOrdinal = getenv("GPU_DEVICE_ORDINAL") + HsaOverrideGfxVersion = getenv("HSA_OVERRIDE_GFX_VERSION") } func loadKeepAlive(ka string) { diff --git a/envconfig/config_test.go b/envconfig/config_test.go index dc65ef70..b364b009 100644 --- a/envconfig/config_test.go +++ b/envconfig/config_test.go @@ -20,8 +20,8 @@ func TestSmoke(t *testing.T) { require.True(t, Debug()) t.Setenv("OLLAMA_FLASH_ATTENTION", "1") - LoadConfig() - require.True(t, FlashAttention) + require.True(t, FlashAttention()) + t.Setenv("OLLAMA_KEEP_ALIVE", "") LoadConfig() require.Equal(t, 5*time.Minute, KeepAlive) @@ -162,3 +162,27 @@ func TestOrigins(t *testing.T) { }) } } + +func TestBool(t *testing.T) { + cases := map[string]struct { + value string + expect bool + }{ + "empty": {"", false}, + "true": {"true", true}, + "false": 
{"false", false}, + "1": {"1", true}, + "0": {"0", false}, + "random": {"random", true}, + "something": {"something", true}, + } + + for name, tt := range cases { + t.Run(name, func(t *testing.T) { + t.Setenv("OLLAMA_BOOL", tt.value) + if b := Bool("OLLAMA_BOOL"); b() != tt.expect { + t.Errorf("%s: expected %t, got %t", name, tt.expect, b()) + } + }) + } +} diff --git a/gpu/gpu.go b/gpu/gpu.go index 1815668f..c3059542 100644 --- a/gpu/gpu.go +++ b/gpu/gpu.go @@ -302,7 +302,7 @@ func GetGPUInfo() GpuInfoList { } // Intel - if envconfig.IntelGpu { + if envconfig.IntelGPU() { oHandles = initOneAPIHandles() // On windows we bundle the oneapi library one level above the runner dir depPath = "" diff --git a/llm/server.go b/llm/server.go index eb966650..84d9e93a 100644 --- a/llm/server.go +++ b/llm/server.go @@ -221,7 +221,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr params = append(params, "--memory-f32") } - flashAttnEnabled := envconfig.FlashAttention + flashAttnEnabled := envconfig.FlashAttention() for _, g := range gpus { // only cuda (compute capability 7+) and metal support flash attention diff --git a/server/images.go b/server/images.go index 574dec19..3eb3b3fa 100644 --- a/server/images.go +++ b/server/images.go @@ -644,7 +644,7 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio return err } - if !envconfig.NoPrune && old != nil { + if !envconfig.NoPrune() && old != nil { if err := old.RemoveLayers(); err != nil { return err } @@ -883,7 +883,7 @@ func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn fu // build deleteMap to prune unused layers deleteMap := make(map[string]struct{}) - if !envconfig.NoPrune { + if !envconfig.NoPrune() { manifest, _, err = GetManifest(mp) if err != nil && !errors.Is(err, os.ErrNotExist) { return err diff --git a/server/routes.go b/server/routes.go index 07898d9b..41a73cb4 100644 --- a/server/routes.go +++ b/server/routes.go @@ -1121,7 +1121,7 @@ func Serve(ln net.Listener) error { return err } - if !envconfig.NoPrune { + if !envconfig.NoPrune() { // clean up unused layers and manifests if err := PruneLayers(); err != nil { return err diff --git a/server/sched.go b/server/sched.go index 2daed3ab..e1e986a5 100644 --- a/server/sched.go +++ b/server/sched.go @@ -695,7 +695,7 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numP // First attempt to fit the model into a single GPU for _, p := range numParallelToTry { req.opts.NumCtx = req.origNumCtx * p - if !envconfig.SchedSpread { + if !envconfig.SchedSpread() { for _, g := range sgl { if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok { slog.Info("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "parallel", p, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM)) From 8570c1c0ef73e89448f6724645f56b9b10efef44 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Wed, 3 Jul 2024 18:39:35 -0700 Subject: [PATCH 12/79] keepalive --- envconfig/config.go | 51 +++++++++++++++++----------------------- envconfig/config_test.go | 49 +++++++++++++++++++++++++------------- server/sched.go | 2 +- 3 files changed, 55 insertions(+), 47 deletions(-) diff --git a/envconfig/config.go b/envconfig/config.go index ea78585b..62bfad64 100644 --- a/envconfig/config.go +++ b/envconfig/config.go @@ -99,6 +99,26 @@ func Models() string { return 
filepath.Join(home, ".ollama", "models") } +// KeepAlive returns the duration that models stay loaded in memory. KeepAlive can be configured via the OLLAMA_KEEP_ALIVE environment variable. +// Negative values are treated as infinite. Zero is treated as no keep alive. +// Default is 5 minutes. +func KeepAlive() (keepAlive time.Duration) { + keepAlive = 5 * time.Minute + if s := os.Getenv("OLLAMA_KEEP_ALIVE"); s != "" { + if d, err := time.ParseDuration(s); err == nil { + keepAlive = d + } else if n, err := strconv.ParseInt(s, 10, 64); err == nil { + keepAlive = time.Duration(n) * time.Second + } + } + + if keepAlive < 0 { + return time.Duration(math.MaxInt64) + } + + return keepAlive +} + func Bool(k string) func() bool { return func() bool { if s := getenv(k); s != "" { @@ -130,8 +150,6 @@ var ( ) var ( - // Set via OLLAMA_KEEP_ALIVE in the environment - KeepAlive time.Duration // Set via OLLAMA_LLM_LIBRARY in the environment LLMLibrary string // Set via OLLAMA_MAX_LOADED_MODELS in the environment @@ -168,7 +186,7 @@ func AsMap() map[string]EnvVar { "OLLAMA_DEBUG": {"OLLAMA_DEBUG", Debug(), "Show additional debug information (e.g. OLLAMA_DEBUG=1)"}, "OLLAMA_FLASH_ATTENTION": {"OLLAMA_FLASH_ATTENTION", FlashAttention(), "Enabled flash attention"}, "OLLAMA_HOST": {"OLLAMA_HOST", Host(), "IP Address for the ollama server (default 127.0.0.1:11434)"}, - "OLLAMA_KEEP_ALIVE": {"OLLAMA_KEEP_ALIVE", KeepAlive, "The duration that models stay loaded in memory (default \"5m\")"}, + "OLLAMA_KEEP_ALIVE": {"OLLAMA_KEEP_ALIVE", KeepAlive(), "The duration that models stay loaded in memory (default \"5m\")"}, "OLLAMA_LLM_LIBRARY": {"OLLAMA_LLM_LIBRARY", LLMLibrary, "Set LLM library to bypass autodetection"}, "OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners, "Maximum number of loaded models per GPU"}, "OLLAMA_MAX_QUEUE": {"OLLAMA_MAX_QUEUE", MaxQueuedRequests, "Maximum number of queued requests"}, @@ -210,7 +228,6 @@ func init() { NumParallel = 0 // Autoselect MaxRunners = 0 // Autoselect MaxQueuedRequests = 512 - KeepAlive = 5 * time.Minute LoadConfig() } @@ -284,35 +301,9 @@ func LoadConfig() { } } - ka := getenv("OLLAMA_KEEP_ALIVE") - if ka != "" { - loadKeepAlive(ka) - } - CudaVisibleDevices = getenv("CUDA_VISIBLE_DEVICES") HipVisibleDevices = getenv("HIP_VISIBLE_DEVICES") RocrVisibleDevices = getenv("ROCR_VISIBLE_DEVICES") GpuDeviceOrdinal = getenv("GPU_DEVICE_ORDINAL") HsaOverrideGfxVersion = getenv("HSA_OVERRIDE_GFX_VERSION") } - -func loadKeepAlive(ka string) { - v, err := strconv.Atoi(ka) - if err != nil { - d, err := time.ParseDuration(ka) - if err == nil { - if d < 0 { - KeepAlive = time.Duration(math.MaxInt64) - } else { - KeepAlive = d - } - } - } else { - d := time.Duration(v) * time.Second - if d < 0 { - KeepAlive = time.Duration(math.MaxInt64) - } else { - KeepAlive = d - } - } -} diff --git a/envconfig/config_test.go b/envconfig/config_test.go index b364b009..87c808ca 100644 --- a/envconfig/config_test.go +++ b/envconfig/config_test.go @@ -21,22 +21,6 @@ func TestSmoke(t *testing.T) { t.Setenv("OLLAMA_FLASH_ATTENTION", "1") require.True(t, FlashAttention()) - - t.Setenv("OLLAMA_KEEP_ALIVE", "") - LoadConfig() - require.Equal(t, 5*time.Minute, KeepAlive) - t.Setenv("OLLAMA_KEEP_ALIVE", "3") - LoadConfig() - require.Equal(t, 3*time.Second, KeepAlive) - t.Setenv("OLLAMA_KEEP_ALIVE", "1h") - LoadConfig() - require.Equal(t, 1*time.Hour, KeepAlive) - t.Setenv("OLLAMA_KEEP_ALIVE", "-1s") - LoadConfig() - require.Equal(t, time.Duration(math.MaxInt64), KeepAlive) - 
t.Setenv("OLLAMA_KEEP_ALIVE", "-1") - LoadConfig() - require.Equal(t, time.Duration(math.MaxInt64), KeepAlive) } func TestHost(t *testing.T) { @@ -186,3 +170,36 @@ func TestBool(t *testing.T) { }) } } + +func TestKeepAlive(t *testing.T) { + cases := map[string]time.Duration{ + "": 5 * time.Minute, + "1s": time.Second, + "1m": time.Minute, + "1h": time.Hour, + "5m0s": 5 * time.Minute, + "1h2m3s": 1*time.Hour + 2*time.Minute + 3*time.Second, + "0": time.Duration(0), + "60": 60 * time.Second, + "120": 2 * time.Minute, + "3600": time.Hour, + "-0": time.Duration(0), + "-1": time.Duration(math.MaxInt64), + "-1m": time.Duration(math.MaxInt64), + // invalid values + " ": 5 * time.Minute, + "???": 5 * time.Minute, + "1d": 5 * time.Minute, + "1y": 5 * time.Minute, + "1w": 5 * time.Minute, + } + + for tt, expect := range cases { + t.Run(tt, func(t *testing.T) { + t.Setenv("OLLAMA_KEEP_ALIVE", tt) + if actual := KeepAlive(); actual != expect { + t.Errorf("%s: expected %s, got %s", tt, expect, actual) + } + }) + } +} diff --git a/server/sched.go b/server/sched.go index e1e986a5..ad40c4ef 100644 --- a/server/sched.go +++ b/server/sched.go @@ -401,7 +401,7 @@ func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, if numParallel < 1 { numParallel = 1 } - sessionDuration := envconfig.KeepAlive + sessionDuration := envconfig.KeepAlive() if req.sessionDuration != nil { sessionDuration = req.sessionDuration.Duration } From e2c3f6b3e2de014656ab9ddffccf7b89d1bcc09e Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Wed, 3 Jul 2024 19:30:19 -0700 Subject: [PATCH 13/79] string --- envconfig/config.go | 143 ++++++++++++++++++++++---------------------- gpu/amd_linux.go | 8 +-- gpu/amd_windows.go | 2 +- gpu/assets.go | 6 +- gpu/gpu.go | 8 +-- llm/server.go | 2 +- 6 files changed, 85 insertions(+), 84 deletions(-) diff --git a/envconfig/config.go b/envconfig/config.go index 62bfad64..34cc4dac 100644 --- a/envconfig/config.go +++ b/envconfig/config.go @@ -149,30 +149,77 @@ var ( IntelGPU = Bool("OLLAMA_INTEL_GPU") ) +func String(s string) func() string { + return func() string { + return getenv(s) + } +} + +var ( + LLMLibrary = String("OLLAMA_LLM_LIBRARY") + TmpDir = String("OLLAMA_TMPDIR") + + CudaVisibleDevices = String("CUDA_VISIBLE_DEVICES") + HipVisibleDevices = String("HIP_VISIBLE_DEVICES") + RocrVisibleDevices = String("ROCR_VISIBLE_DEVICES") + GpuDeviceOrdinal = String("GPU_DEVICE_ORDINAL") + HsaOverrideGfxVersion = String("HSA_OVERRIDE_GFX_VERSION") +) + +func RunnersDir() (p string) { + if p := getenv("OLLAMA_RUNNERS_DIR"); p != "" { + return p + } + + if runtime.GOOS != "windows" { + return + } + + defer func() { + if p == "" { + slog.Error("unable to locate llm runner directory. 
Set OLLAMA_RUNNERS_DIR to the location of 'ollama_runners'") + } + }() + + // On Windows we do not carry the payloads inside the main executable + exe, err := os.Executable() + if err != nil { + return + } + + cwd, err := os.Getwd() + if err != nil { + return + } + + var paths []string + for _, root := range []string{filepath.Dir(exe), cwd} { + paths = append(paths, + root, + filepath.Join(root, "windows-"+runtime.GOARCH), + filepath.Join(root, "dist", "windows-"+runtime.GOARCH), + ) + } + + // Try a few variations to improve developer experience when building from source in the local tree + for _, path := range paths { + candidate := filepath.Join(path, "ollama_runners") + if _, err := os.Stat(candidate); err == nil { + p = candidate + break + } + } + + return p +} + var ( - // Set via OLLAMA_LLM_LIBRARY in the environment - LLMLibrary string // Set via OLLAMA_MAX_LOADED_MODELS in the environment MaxRunners int // Set via OLLAMA_MAX_QUEUE in the environment MaxQueuedRequests int // Set via OLLAMA_NUM_PARALLEL in the environment NumParallel int - // Set via OLLAMA_RUNNERS_DIR in the environment - RunnersDir string - // Set via OLLAMA_TMPDIR in the environment - TmpDir string - - // Set via CUDA_VISIBLE_DEVICES in the environment - CudaVisibleDevices string - // Set via HIP_VISIBLE_DEVICES in the environment - HipVisibleDevices string - // Set via ROCR_VISIBLE_DEVICES in the environment - RocrVisibleDevices string - // Set via GPU_DEVICE_ORDINAL in the environment - GpuDeviceOrdinal string - // Set via HSA_OVERRIDE_GFX_VERSION in the environment - HsaOverrideGfxVersion string ) type EnvVar struct { @@ -187,7 +234,7 @@ func AsMap() map[string]EnvVar { "OLLAMA_FLASH_ATTENTION": {"OLLAMA_FLASH_ATTENTION", FlashAttention(), "Enabled flash attention"}, "OLLAMA_HOST": {"OLLAMA_HOST", Host(), "IP Address for the ollama server (default 127.0.0.1:11434)"}, "OLLAMA_KEEP_ALIVE": {"OLLAMA_KEEP_ALIVE", KeepAlive(), "The duration that models stay loaded in memory (default \"5m\")"}, - "OLLAMA_LLM_LIBRARY": {"OLLAMA_LLM_LIBRARY", LLMLibrary, "Set LLM library to bypass autodetection"}, + "OLLAMA_LLM_LIBRARY": {"OLLAMA_LLM_LIBRARY", LLMLibrary(), "Set LLM library to bypass autodetection"}, "OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners, "Maximum number of loaded models per GPU"}, "OLLAMA_MAX_QUEUE": {"OLLAMA_MAX_QUEUE", MaxQueuedRequests, "Maximum number of queued requests"}, "OLLAMA_MODELS": {"OLLAMA_MODELS", Models(), "The path to the models directory"}, @@ -195,16 +242,16 @@ func AsMap() map[string]EnvVar { "OLLAMA_NOPRUNE": {"OLLAMA_NOPRUNE", NoPrune(), "Do not prune model blobs on startup"}, "OLLAMA_NUM_PARALLEL": {"OLLAMA_NUM_PARALLEL", NumParallel, "Maximum number of parallel requests"}, "OLLAMA_ORIGINS": {"OLLAMA_ORIGINS", Origins(), "A comma separated list of allowed origins"}, - "OLLAMA_RUNNERS_DIR": {"OLLAMA_RUNNERS_DIR", RunnersDir, "Location for runners"}, + "OLLAMA_RUNNERS_DIR": {"OLLAMA_RUNNERS_DIR", RunnersDir(), "Location for runners"}, "OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"}, - "OLLAMA_TMPDIR": {"OLLAMA_TMPDIR", TmpDir, "Location for temporary files"}, + "OLLAMA_TMPDIR": {"OLLAMA_TMPDIR", TmpDir(), "Location for temporary files"}, } if runtime.GOOS != "darwin" { - ret["CUDA_VISIBLE_DEVICES"] = EnvVar{"CUDA_VISIBLE_DEVICES", CudaVisibleDevices, "Set which NVIDIA devices are visible"} - ret["HIP_VISIBLE_DEVICES"] = EnvVar{"HIP_VISIBLE_DEVICES", HipVisibleDevices, "Set which AMD devices are visible"} - 
ret["ROCR_VISIBLE_DEVICES"] = EnvVar{"ROCR_VISIBLE_DEVICES", RocrVisibleDevices, "Set which AMD devices are visible"} - ret["GPU_DEVICE_ORDINAL"] = EnvVar{"GPU_DEVICE_ORDINAL", GpuDeviceOrdinal, "Set which AMD devices are visible"} - ret["HSA_OVERRIDE_GFX_VERSION"] = EnvVar{"HSA_OVERRIDE_GFX_VERSION", HsaOverrideGfxVersion, "Override the gfx used for all detected AMD GPUs"} + ret["CUDA_VISIBLE_DEVICES"] = EnvVar{"CUDA_VISIBLE_DEVICES", CudaVisibleDevices(), "Set which NVIDIA devices are visible"} + ret["HIP_VISIBLE_DEVICES"] = EnvVar{"HIP_VISIBLE_DEVICES", HipVisibleDevices(), "Set which AMD devices are visible"} + ret["ROCR_VISIBLE_DEVICES"] = EnvVar{"ROCR_VISIBLE_DEVICES", RocrVisibleDevices(), "Set which AMD devices are visible"} + ret["GPU_DEVICE_ORDINAL"] = EnvVar{"GPU_DEVICE_ORDINAL", GpuDeviceOrdinal(), "Set which AMD devices are visible"} + ret["HSA_OVERRIDE_GFX_VERSION"] = EnvVar{"HSA_OVERRIDE_GFX_VERSION", HsaOverrideGfxVersion(), "Override the gfx used for all detected AMD GPUs"} ret["OLLAMA_INTEL_GPU"] = EnvVar{"OLLAMA_INTEL_GPU", IntelGPU(), "Enable experimental Intel GPU detection"} } return ret @@ -233,46 +280,6 @@ func init() { } func LoadConfig() { - RunnersDir = getenv("OLLAMA_RUNNERS_DIR") - if runtime.GOOS == "windows" && RunnersDir == "" { - // On Windows we do not carry the payloads inside the main executable - appExe, err := os.Executable() - if err != nil { - slog.Error("failed to lookup executable path", "error", err) - } - - cwd, err := os.Getwd() - if err != nil { - slog.Error("failed to lookup working directory", "error", err) - } - - var paths []string - for _, root := range []string{filepath.Dir(appExe), cwd} { - paths = append(paths, - root, - filepath.Join(root, "windows-"+runtime.GOARCH), - filepath.Join(root, "dist", "windows-"+runtime.GOARCH), - ) - } - - // Try a few variations to improve developer experience when building from source in the local tree - for _, p := range paths { - candidate := filepath.Join(p, "ollama_runners") - _, err := os.Stat(candidate) - if err == nil { - RunnersDir = candidate - break - } - } - if RunnersDir == "" { - slog.Error("unable to locate llm runner directory. 
Set OLLAMA_RUNNERS_DIR to the location of 'ollama_runners'") - } - } - - TmpDir = getenv("OLLAMA_TMPDIR") - - LLMLibrary = getenv("OLLAMA_LLM_LIBRARY") - if onp := getenv("OLLAMA_NUM_PARALLEL"); onp != "" { val, err := strconv.Atoi(onp) if err != nil { @@ -300,10 +307,4 @@ func LoadConfig() { MaxQueuedRequests = p } } - - CudaVisibleDevices = getenv("CUDA_VISIBLE_DEVICES") - HipVisibleDevices = getenv("HIP_VISIBLE_DEVICES") - RocrVisibleDevices = getenv("ROCR_VISIBLE_DEVICES") - GpuDeviceOrdinal = getenv("GPU_DEVICE_ORDINAL") - HsaOverrideGfxVersion = getenv("HSA_OVERRIDE_GFX_VERSION") } diff --git a/gpu/amd_linux.go b/gpu/amd_linux.go index 15b6fc61..33dd03ab 100644 --- a/gpu/amd_linux.go +++ b/gpu/amd_linux.go @@ -60,9 +60,9 @@ func AMDGetGPUInfo() []RocmGPUInfo { // Determine if the user has already pre-selected which GPUs to look at, then ignore the others var visibleDevices []string - hipVD := envconfig.HipVisibleDevices // zero based index only - rocrVD := envconfig.RocrVisibleDevices // zero based index or UUID, but consumer cards seem to not support UUID - gpuDO := envconfig.GpuDeviceOrdinal // zero based index + hipVD := envconfig.HipVisibleDevices() // zero based index only + rocrVD := envconfig.RocrVisibleDevices() // zero based index or UUID, but consumer cards seem to not support UUID + gpuDO := envconfig.GpuDeviceOrdinal() // zero based index switch { // TODO is this priorty order right? case hipVD != "": @@ -75,7 +75,7 @@ func AMDGetGPUInfo() []RocmGPUInfo { visibleDevices = strings.Split(gpuDO, ",") } - gfxOverride := envconfig.HsaOverrideGfxVersion + gfxOverride := envconfig.HsaOverrideGfxVersion() var supported []string libDir := "" diff --git a/gpu/amd_windows.go b/gpu/amd_windows.go index 20aed447..a170dfdc 100644 --- a/gpu/amd_windows.go +++ b/gpu/amd_windows.go @@ -53,7 +53,7 @@ func AMDGetGPUInfo() []RocmGPUInfo { } var supported []string - gfxOverride := envconfig.HsaOverrideGfxVersion + gfxOverride := envconfig.HsaOverrideGfxVersion() if gfxOverride == "" { supported, err = GetSupportedGFX(libDir) if err != nil { diff --git a/gpu/assets.go b/gpu/assets.go index 073d2e81..39ff7c21 100644 --- a/gpu/assets.go +++ b/gpu/assets.go @@ -26,7 +26,7 @@ func PayloadsDir() (string, error) { defer lock.Unlock() var err error if payloadsDir == "" { - runnersDir := envconfig.RunnersDir + runnersDir := envconfig.RunnersDir() if runnersDir != "" { payloadsDir = runnersDir @@ -35,7 +35,7 @@ func PayloadsDir() (string, error) { // The remainder only applies on non-windows where we still carry payloads in the main executable cleanupTmpDirs() - tmpDir := envconfig.TmpDir + tmpDir := envconfig.TmpDir() if tmpDir == "" { tmpDir, err = os.MkdirTemp("", "ollama") if err != nil { @@ -105,7 +105,7 @@ func cleanupTmpDirs() { func Cleanup() { lock.Lock() defer lock.Unlock() - runnersDir := envconfig.RunnersDir + runnersDir := envconfig.RunnersDir() if payloadsDir != "" && runnersDir == "" && runtime.GOOS != "windows" { // We want to fully clean up the tmpdir parent of the payloads dir tmpDir := filepath.Clean(filepath.Join(payloadsDir, "..")) diff --git a/gpu/gpu.go b/gpu/gpu.go index c3059542..acab1c8d 100644 --- a/gpu/gpu.go +++ b/gpu/gpu.go @@ -230,8 +230,8 @@ func GetGPUInfo() GpuInfoList { // On windows we bundle the nvidia library one level above the runner dir depPath := "" - if runtime.GOOS == "windows" && envconfig.RunnersDir != "" { - depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir), "cuda") + if runtime.GOOS == "windows" && envconfig.RunnersDir() != "" { + depPath = 
filepath.Join(filepath.Dir(envconfig.RunnersDir()), "cuda") } // Load ALL libraries @@ -306,8 +306,8 @@ func GetGPUInfo() GpuInfoList { oHandles = initOneAPIHandles() // On windows we bundle the oneapi library one level above the runner dir depPath = "" - if runtime.GOOS == "windows" && envconfig.RunnersDir != "" { - depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir), "oneapi") + if runtime.GOOS == "windows" && envconfig.RunnersDir() != "" { + depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir()), "oneapi") } for d := range oHandles.oneapi.num_drivers { diff --git a/llm/server.go b/llm/server.go index 84d9e93a..0741d386 100644 --- a/llm/server.go +++ b/llm/server.go @@ -163,7 +163,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr } else { servers = serversForGpu(gpus[0]) // All GPUs in the list are matching Library and Variant } - demandLib := envconfig.LLMLibrary + demandLib := envconfig.LLMLibrary() if demandLib != "" { serverPath := availableServers[demandLib] if serverPath == "" { From 0f1910129f0a73c469ce2c012d39c8d98b79ef80 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Wed, 3 Jul 2024 19:41:17 -0700 Subject: [PATCH 14/79] int --- envconfig/config.go | 66 ++++++++++------------------------- integration/basic_test.go | 9 +---- integration/max_queue_test.go | 14 ++++---- server/sched.go | 23 +++++++----- server/sched_test.go | 7 ++-- 5 files changed, 42 insertions(+), 77 deletions(-) diff --git a/envconfig/config.go b/envconfig/config.go index 34cc4dac..01abea42 100644 --- a/envconfig/config.go +++ b/envconfig/config.go @@ -213,13 +213,22 @@ func RunnersDir() (p string) { return p } +func Int(k string, n int) func() int { + return func() int { + if s := getenv(k); s != "" { + if n, err := strconv.ParseInt(s, 10, 64); err == nil && n >= 0 { + return int(n) + } + } + + return n + } +} + var ( - // Set via OLLAMA_MAX_LOADED_MODELS in the environment - MaxRunners int - // Set via OLLAMA_MAX_QUEUE in the environment - MaxQueuedRequests int - // Set via OLLAMA_NUM_PARALLEL in the environment - NumParallel int + NumParallel = Int("OLLAMA_NUM_PARALLEL", 0) + MaxRunners = Int("OLLAMA_MAX_LOADED_MODELS", 0) + MaxQueue = Int("OLLAMA_MAX_QUEUE", 512) ) type EnvVar struct { @@ -235,12 +244,12 @@ func AsMap() map[string]EnvVar { "OLLAMA_HOST": {"OLLAMA_HOST", Host(), "IP Address for the ollama server (default 127.0.0.1:11434)"}, "OLLAMA_KEEP_ALIVE": {"OLLAMA_KEEP_ALIVE", KeepAlive(), "The duration that models stay loaded in memory (default \"5m\")"}, "OLLAMA_LLM_LIBRARY": {"OLLAMA_LLM_LIBRARY", LLMLibrary(), "Set LLM library to bypass autodetection"}, - "OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners, "Maximum number of loaded models per GPU"}, - "OLLAMA_MAX_QUEUE": {"OLLAMA_MAX_QUEUE", MaxQueuedRequests, "Maximum number of queued requests"}, + "OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners(), "Maximum number of loaded models per GPU"}, + "OLLAMA_MAX_QUEUE": {"OLLAMA_MAX_QUEUE", MaxQueue(), "Maximum number of queued requests"}, "OLLAMA_MODELS": {"OLLAMA_MODELS", Models(), "The path to the models directory"}, "OLLAMA_NOHISTORY": {"OLLAMA_NOHISTORY", NoHistory(), "Do not preserve readline history"}, "OLLAMA_NOPRUNE": {"OLLAMA_NOPRUNE", NoPrune(), "Do not prune model blobs on startup"}, - "OLLAMA_NUM_PARALLEL": {"OLLAMA_NUM_PARALLEL", NumParallel, "Maximum number of parallel requests"}, + "OLLAMA_NUM_PARALLEL": {"OLLAMA_NUM_PARALLEL", NumParallel(), "Maximum number of parallel requests"}, "OLLAMA_ORIGINS": 
{"OLLAMA_ORIGINS", Origins(), "A comma separated list of allowed origins"}, "OLLAMA_RUNNERS_DIR": {"OLLAMA_RUNNERS_DIR", RunnersDir(), "Location for runners"}, "OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"}, @@ -269,42 +278,3 @@ func Values() map[string]string { func getenv(key string) string { return strings.Trim(os.Getenv(key), "\"' ") } - -func init() { - // default values - NumParallel = 0 // Autoselect - MaxRunners = 0 // Autoselect - MaxQueuedRequests = 512 - - LoadConfig() -} - -func LoadConfig() { - if onp := getenv("OLLAMA_NUM_PARALLEL"); onp != "" { - val, err := strconv.Atoi(onp) - if err != nil { - slog.Error("invalid setting, ignoring", "OLLAMA_NUM_PARALLEL", onp, "error", err) - } else { - NumParallel = val - } - } - - maxRunners := getenv("OLLAMA_MAX_LOADED_MODELS") - if maxRunners != "" { - m, err := strconv.Atoi(maxRunners) - if err != nil { - slog.Error("invalid setting, ignoring", "OLLAMA_MAX_LOADED_MODELS", maxRunners, "error", err) - } else { - MaxRunners = m - } - } - - if onp := os.Getenv("OLLAMA_MAX_QUEUE"); onp != "" { - p, err := strconv.Atoi(onp) - if err != nil || p <= 0 { - slog.Error("invalid setting, ignoring", "OLLAMA_MAX_QUEUE", onp, "error", err) - } else { - MaxQueuedRequests = p - } - } -} diff --git a/integration/basic_test.go b/integration/basic_test.go index 6e632a1c..8e35b5c5 100644 --- a/integration/basic_test.go +++ b/integration/basic_test.go @@ -45,14 +45,7 @@ func TestUnicodeModelDir(t *testing.T) { defer os.RemoveAll(modelDir) slog.Info("unicode", "OLLAMA_MODELS", modelDir) - oldModelsDir := os.Getenv("OLLAMA_MODELS") - if oldModelsDir == "" { - defer os.Unsetenv("OLLAMA_MODELS") - } else { - defer os.Setenv("OLLAMA_MODELS", oldModelsDir) - } - err = os.Setenv("OLLAMA_MODELS", modelDir) - require.NoError(t, err) + t.Setenv("OLLAMA_MODELS", modelDir) ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute) defer cancel() diff --git a/integration/max_queue_test.go b/integration/max_queue_test.go index dfa5eae0..b06197e1 100644 --- a/integration/max_queue_test.go +++ b/integration/max_queue_test.go @@ -5,7 +5,6 @@ package integration import ( "context" "errors" - "fmt" "log/slog" "os" "strconv" @@ -14,8 +13,10 @@ import ( "testing" "time" - "github.com/ollama/ollama/api" "github.com/stretchr/testify/require" + + "github.com/ollama/ollama/api" + "github.com/ollama/ollama/envconfig" ) func TestMaxQueue(t *testing.T) { @@ -27,13 +28,10 @@ func TestMaxQueue(t *testing.T) { // Note: This test can be quite slow when running in CPU mode, so keep the threadCount low unless your on GPU // Also note that by default Darwin can't sustain > ~128 connections without adjusting limits threadCount := 32 - mq := os.Getenv("OLLAMA_MAX_QUEUE") - if mq != "" { - var err error - threadCount, err = strconv.Atoi(mq) - require.NoError(t, err) + if maxQueue := envconfig.MaxQueue(); maxQueue != 0 { + threadCount = maxQueue } else { - os.Setenv("OLLAMA_MAX_QUEUE", fmt.Sprintf("%d", threadCount)) + t.Setenv("OLLAMA_MAX_QUEUE", strconv.Itoa(threadCount)) } req := api.GenerateRequest{ diff --git a/server/sched.go b/server/sched.go index ad40c4ef..610a2c50 100644 --- a/server/sched.go +++ b/server/sched.go @@ -5,9 +5,11 @@ import ( "errors" "fmt" "log/slog" + "os" "reflect" "runtime" "sort" + "strconv" "strings" "sync" "time" @@ -59,11 +61,12 @@ var defaultParallel = 4 var ErrMaxQueue = fmt.Errorf("server busy, please try again. 
maximum pending requests exceeded") func InitScheduler(ctx context.Context) *Scheduler { + maxQueue := envconfig.MaxQueue() sched := &Scheduler{ - pendingReqCh: make(chan *LlmRequest, envconfig.MaxQueuedRequests), - finishedReqCh: make(chan *LlmRequest, envconfig.MaxQueuedRequests), - expiredCh: make(chan *runnerRef, envconfig.MaxQueuedRequests), - unloadedCh: make(chan interface{}, envconfig.MaxQueuedRequests), + pendingReqCh: make(chan *LlmRequest, maxQueue), + finishedReqCh: make(chan *LlmRequest, maxQueue), + expiredCh: make(chan *runnerRef, maxQueue), + unloadedCh: make(chan interface{}, maxQueue), loaded: make(map[string]*runnerRef), newServerFn: llm.NewLlamaServer, getGpuFn: gpu.GetGPUInfo, @@ -126,7 +129,7 @@ func (s *Scheduler) processPending(ctx context.Context) { slog.Debug("pending request cancelled or timed out, skipping scheduling") continue } - numParallel := envconfig.NumParallel + numParallel := envconfig.NumParallel() // TODO (jmorganca): multimodal models don't support parallel yet // see https://github.com/ollama/ollama/issues/4165 if len(pending.model.ProjectorPaths) > 0 && numParallel != 1 { @@ -148,7 +151,7 @@ func (s *Scheduler) processPending(ctx context.Context) { pending.useLoadedRunner(runner, s.finishedReqCh) break } - } else if envconfig.MaxRunners > 0 && loadedCount >= envconfig.MaxRunners { + } else if envconfig.MaxRunners() > 0 && loadedCount >= envconfig.MaxRunners() { slog.Debug("max runners achieved, unloading one to make room", "runner_count", loadedCount) runnerToExpire = s.findRunnerToUnload() } else { @@ -161,7 +164,7 @@ func (s *Scheduler) processPending(ctx context.Context) { gpus = s.getGpuFn() } - if envconfig.MaxRunners <= 0 { + if envconfig.MaxRunners() <= 0 { // No user specified MaxRunners, so figure out what automatic setting to use // If all GPUs have reliable free memory reporting, defaultModelsPerGPU * the number of GPUs // if any GPU has unreliable free memory reporting, 1x the number of GPUs @@ -173,11 +176,13 @@ func (s *Scheduler) processPending(ctx context.Context) { } } if allReliable { - envconfig.MaxRunners = defaultModelsPerGPU * len(gpus) + // HACK + os.Setenv("OLLAMA_MAX_LOADED_MODELS", strconv.Itoa(defaultModelsPerGPU*len(gpus))) slog.Debug("updating default concurrency", "OLLAMA_MAX_LOADED_MODELS", envconfig.MaxRunners, "gpu_count", len(gpus)) } else { + // HACK + os.Setenv("OLLAMA_MAX_LOADED_MODELS", strconv.Itoa(len(gpus))) slog.Info("one or more GPUs detected that are unable to accurately report free memory - disabling default concurrency") - envconfig.MaxRunners = len(gpus) } } diff --git a/server/sched_test.go b/server/sched_test.go index 9ddd1fab..3166ff66 100644 --- a/server/sched_test.go +++ b/server/sched_test.go @@ -12,7 +12,6 @@ import ( "github.com/ollama/ollama/api" "github.com/ollama/ollama/app/lifecycle" - "github.com/ollama/ollama/envconfig" "github.com/ollama/ollama/format" "github.com/ollama/ollama/gpu" "github.com/ollama/ollama/llm" @@ -272,7 +271,7 @@ func TestRequestsMultipleLoadedModels(t *testing.T) { c.req.opts.NumGPU = 0 // CPU load, will be allowed d := newScenarioRequest(t, ctx, "ollama-model-3c", 30, nil) // Needs prior unloaded - envconfig.MaxRunners = 1 + t.Setenv("OLLAMA_MAX_LOADED_MODELS", "1") s.newServerFn = a.newServer slog.Info("a") s.pendingReqCh <- a.req @@ -291,7 +290,7 @@ func TestRequestsMultipleLoadedModels(t *testing.T) { require.Len(t, s.loaded, 1) s.loadedMu.Unlock() - envconfig.MaxRunners = 0 + t.Setenv("OLLAMA_MAX_LOADED_MODELS", "0") s.newServerFn = b.newServer slog.Info("b") 
s.pendingReqCh <- b.req @@ -362,7 +361,7 @@ func TestGetRunner(t *testing.T) { a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, &api.Duration{Duration: 2 * time.Millisecond}) b := newScenarioRequest(t, ctx, "ollama-model-1b", 10, &api.Duration{Duration: 2 * time.Millisecond}) c := newScenarioRequest(t, ctx, "ollama-model-1c", 10, &api.Duration{Duration: 2 * time.Millisecond}) - envconfig.MaxQueuedRequests = 1 + t.Setenv("OLLAMA_MAX_QUEUE", "1") s := InitScheduler(ctx) s.getGpuFn = getGpuFn s.getCpuFn = getCpuFn From 1954ec5917bf81ac743ba19bf0e7a6da47766778 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Wed, 3 Jul 2024 19:43:17 -0700 Subject: [PATCH 15/79] uint64 --- api/client_test.go | 3 -- integration/concurrency_test.go | 70 +++++++++++++++++---------------- server/manifest_test.go | 2 - server/modelpath_test.go | 3 -- server/routes_create_test.go | 10 ----- server/routes_delete_test.go | 2 - server/routes_list_test.go | 2 - server/routes_test.go | 4 -- 8 files changed, 37 insertions(+), 59 deletions(-) diff --git a/api/client_test.go b/api/client_test.go index fe9fd74f..23fe9334 100644 --- a/api/client_test.go +++ b/api/client_test.go @@ -2,8 +2,6 @@ package api import ( "testing" - - "github.com/ollama/ollama/envconfig" ) func TestClientFromEnvironment(t *testing.T) { @@ -33,7 +31,6 @@ func TestClientFromEnvironment(t *testing.T) { for k, v := range testCases { t.Run(k, func(t *testing.T) { t.Setenv("OLLAMA_HOST", v.value) - envconfig.LoadConfig() client, err := ClientFromEnvironment() if err != v.err { diff --git a/integration/concurrency_test.go b/integration/concurrency_test.go index 8593285b..81d0b587 100644 --- a/integration/concurrency_test.go +++ b/integration/concurrency_test.go @@ -5,14 +5,16 @@ package integration import ( "context" "log/slog" - "os" "strconv" "sync" "testing" "time" - "github.com/ollama/ollama/api" "github.com/stretchr/testify/require" + + "github.com/ollama/ollama/api" + "github.com/ollama/ollama/envconfig" + "github.com/ollama/ollama/format" ) func TestMultiModelConcurrency(t *testing.T) { @@ -106,13 +108,16 @@ func TestIntegrationConcurrentPredictOrcaMini(t *testing.T) { // Stress the system if we know how much VRAM it has, and attempt to load more models than will fit func TestMultiModelStress(t *testing.T) { - vram := os.Getenv("OLLAMA_MAX_VRAM") // TODO - discover actual VRAM - if vram == "" { + s := os.Getenv("OLLAMA_MAX_VRAM") // TODO - discover actual VRAM + if s == "" { t.Skip("OLLAMA_MAX_VRAM not specified, can't pick the right models for the stress test") } - max, err := strconv.ParseUint(vram, 10, 64) - require.NoError(t, err) - const MB = uint64(1024 * 1024) + + maxVram, err := strconv.ParseUint(s, 10, 64) + if err != nil { + t.Fatal(err) + } + type model struct { name string size uint64 // Approximate amount of VRAM they typically use when fully loaded in VRAM @@ -121,83 +126,82 @@ func TestMultiModelStress(t *testing.T) { smallModels := []model{ { name: "orca-mini", - size: 2992 * MB, + size: 2992 * format.MebiByte, }, { name: "phi", - size: 2616 * MB, + size: 2616 * format.MebiByte, }, { name: "gemma:2b", - size: 2364 * MB, + size: 2364 * format.MebiByte, }, { name: "stable-code:3b", - size: 2608 * MB, + size: 2608 * format.MebiByte, }, { name: "starcoder2:3b", - size: 2166 * MB, + size: 2166 * format.MebiByte, }, } mediumModels := []model{ { name: "llama2", - size: 5118 * MB, + size: 5118 * format.MebiByte, }, { name: "mistral", - size: 4620 * MB, + size: 4620 * format.MebiByte, }, { name: "orca-mini:7b", - size: 5118 * MB, + 
size: 5118 * format.MebiByte, }, { name: "dolphin-mistral", - size: 4620 * MB, + size: 4620 * format.MebiByte, }, { name: "gemma:7b", - size: 5000 * MB, + size: 5000 * format.MebiByte, + }, + { + name: "codellama:7b", + size: 5118 * format.MebiByte, }, - // TODO - uncomment this once #3565 is merged and this is rebased on it - // { - // name: "codellama:7b", - // size: 5118 * MB, - // }, } // These seem to be too slow to be useful... // largeModels := []model{ // { // name: "llama2:13b", - // size: 7400 * MB, + // size: 7400 * format.MebiByte, // }, // { // name: "codellama:13b", - // size: 7400 * MB, + // size: 7400 * format.MebiByte, // }, // { // name: "orca-mini:13b", - // size: 7400 * MB, + // size: 7400 * format.MebiByte, // }, // { // name: "gemma:7b", - // size: 5000 * MB, + // size: 5000 * format.MebiByte, // }, // { // name: "starcoder2:15b", - // size: 9100 * MB, + // size: 9100 * format.MebiByte, // }, // } var chosenModels []model switch { - case max < 10000*MB: + case maxVram < 10000*format.MebiByte: slog.Info("selecting small models") chosenModels = smallModels - // case max < 30000*MB: + // case maxVram < 30000*format.MebiByte: default: slog.Info("selecting medium models") chosenModels = mediumModels @@ -226,15 +230,15 @@ func TestMultiModelStress(t *testing.T) { } var wg sync.WaitGroup - consumed := uint64(256 * MB) // Assume some baseline usage + consumed := uint64(256 * format.MebiByte) // Assume some baseline usage for i := 0; i < len(req); i++ { // Always get at least 2 models, but dont' overshoot VRAM too much or we'll take too long - if i > 1 && consumed > max { - slog.Info("achieved target vram exhaustion", "count", i, "vramMB", max/1024/1024, "modelsMB", consumed/1024/1024) + if i > 1 && consumed > vram { + slog.Info("achieved target vram exhaustion", "count", i, "vram", format.HumanBytes2(vram), "models", format.HumanBytes2(consumed)) break } consumed += chosenModels[i].size - slog.Info("target vram", "count", i, "vramMB", max/1024/1024, "modelsMB", consumed/1024/1024) + slog.Info("target vram", "count", i, "vram", format.HumanBytes2(vram), "models", format.HumanBytes2(consumed)) wg.Add(1) go func(i int) { diff --git a/server/manifest_test.go b/server/manifest_test.go index ca6c3d2e..a4af5d5e 100644 --- a/server/manifest_test.go +++ b/server/manifest_test.go @@ -7,7 +7,6 @@ import ( "slices" "testing" - "github.com/ollama/ollama/envconfig" "github.com/ollama/ollama/types/model" ) @@ -108,7 +107,6 @@ func TestManifests(t *testing.T) { t.Run(n, func(t *testing.T) { d := t.TempDir() t.Setenv("OLLAMA_MODELS", d) - envconfig.LoadConfig() for _, p := range wants.ps { createManifest(t, d, p) diff --git a/server/modelpath_test.go b/server/modelpath_test.go index 6c4dfbee..849e0fa7 100644 --- a/server/modelpath_test.go +++ b/server/modelpath_test.go @@ -7,8 +7,6 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" - - "github.com/ollama/ollama/envconfig" ) func TestGetBlobsPath(t *testing.T) { @@ -63,7 +61,6 @@ func TestGetBlobsPath(t *testing.T) { for _, tc := range tests { t.Run(tc.name, func(t *testing.T) { t.Setenv("OLLAMA_MODELS", dir) - envconfig.LoadConfig() got, err := GetBlobsPath(tc.digest) diff --git a/server/routes_create_test.go b/server/routes_create_test.go index 3234ea5e..c853a9e9 100644 --- a/server/routes_create_test.go +++ b/server/routes_create_test.go @@ -15,7 +15,6 @@ import ( "github.com/gin-gonic/gin" "github.com/ollama/ollama/api" - "github.com/ollama/ollama/envconfig" "github.com/ollama/ollama/llm" ) @@ -89,7 
+88,6 @@ func TestCreateFromBin(t *testing.T) { p := t.TempDir() t.Setenv("OLLAMA_MODELS", p) - envconfig.LoadConfig() var s Server w := createRequest(t, s.CreateModelHandler, api.CreateRequest{ @@ -117,7 +115,6 @@ func TestCreateFromModel(t *testing.T) { p := t.TempDir() t.Setenv("OLLAMA_MODELS", p) - envconfig.LoadConfig() var s Server w := createRequest(t, s.CreateModelHandler, api.CreateRequest{ @@ -160,7 +157,6 @@ func TestCreateRemovesLayers(t *testing.T) { p := t.TempDir() t.Setenv("OLLAMA_MODELS", p) - envconfig.LoadConfig() var s Server w := createRequest(t, s.CreateModelHandler, api.CreateRequest{ @@ -209,7 +205,6 @@ func TestCreateUnsetsSystem(t *testing.T) { p := t.TempDir() t.Setenv("OLLAMA_MODELS", p) - envconfig.LoadConfig() var s Server w := createRequest(t, s.CreateModelHandler, api.CreateRequest{ @@ -267,7 +262,6 @@ func TestCreateMergeParameters(t *testing.T) { p := t.TempDir() t.Setenv("OLLAMA_MODELS", p) - envconfig.LoadConfig() var s Server w := createRequest(t, s.CreateModelHandler, api.CreateRequest{ @@ -372,7 +366,6 @@ func TestCreateReplacesMessages(t *testing.T) { p := t.TempDir() t.Setenv("OLLAMA_MODELS", p) - envconfig.LoadConfig() var s Server w := createRequest(t, s.CreateModelHandler, api.CreateRequest{ @@ -450,7 +443,6 @@ func TestCreateTemplateSystem(t *testing.T) { p := t.TempDir() t.Setenv("OLLAMA_MODELS", p) - envconfig.LoadConfig() var s Server w := createRequest(t, s.CreateModelHandler, api.CreateRequest{ @@ -534,7 +526,6 @@ func TestCreateLicenses(t *testing.T) { p := t.TempDir() t.Setenv("OLLAMA_MODELS", p) - envconfig.LoadConfig() var s Server w := createRequest(t, s.CreateModelHandler, api.CreateRequest{ @@ -582,7 +573,6 @@ func TestCreateDetectTemplate(t *testing.T) { p := t.TempDir() t.Setenv("OLLAMA_MODELS", p) - envconfig.LoadConfig() var s Server t.Run("matched", func(t *testing.T) { diff --git a/server/routes_delete_test.go b/server/routes_delete_test.go index 33a97a73..2354d730 100644 --- a/server/routes_delete_test.go +++ b/server/routes_delete_test.go @@ -10,7 +10,6 @@ import ( "github.com/gin-gonic/gin" "github.com/ollama/ollama/api" - "github.com/ollama/ollama/envconfig" "github.com/ollama/ollama/types/model" ) @@ -19,7 +18,6 @@ func TestDelete(t *testing.T) { p := t.TempDir() t.Setenv("OLLAMA_MODELS", p) - envconfig.LoadConfig() var s Server diff --git a/server/routes_list_test.go b/server/routes_list_test.go index c2d9c113..29e3214c 100644 --- a/server/routes_list_test.go +++ b/server/routes_list_test.go @@ -9,14 +9,12 @@ import ( "github.com/gin-gonic/gin" "github.com/ollama/ollama/api" - "github.com/ollama/ollama/envconfig" ) func TestList(t *testing.T) { gin.SetMode(gin.TestMode) t.Setenv("OLLAMA_MODELS", t.TempDir()) - envconfig.LoadConfig() expectNames := []string{ "mistral:7b-instruct-q4_0", diff --git a/server/routes_test.go b/server/routes_test.go index 97786ba2..17da2305 100644 --- a/server/routes_test.go +++ b/server/routes_test.go @@ -19,7 +19,6 @@ import ( "github.com/stretchr/testify/require" "github.com/ollama/ollama/api" - "github.com/ollama/ollama/envconfig" "github.com/ollama/ollama/llm" "github.com/ollama/ollama/openai" "github.com/ollama/ollama/parser" @@ -347,7 +346,6 @@ func Test_Routes(t *testing.T) { } t.Setenv("OLLAMA_MODELS", t.TempDir()) - envconfig.LoadConfig() s := &Server{} router := s.GenerateRoutes() @@ -378,7 +376,6 @@ func Test_Routes(t *testing.T) { func TestCase(t *testing.T) { t.Setenv("OLLAMA_MODELS", t.TempDir()) - envconfig.LoadConfig() cases := []string{ "mistral", @@ -458,7 +455,6 @@ func 
TestCase(t *testing.T) { func TestShow(t *testing.T) { t.Setenv("OLLAMA_MODELS", t.TempDir()) - envconfig.LoadConfig() var s Server From 78140a712ce8feac6fad2ae2c0043056f1a47fdc Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Fri, 5 Jul 2024 16:52:01 -0700 Subject: [PATCH 16/79] cleanup tests --- envconfig/config_test.go | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/envconfig/config_test.go b/envconfig/config_test.go index 87c808ca..977298aa 100644 --- a/envconfig/config_test.go +++ b/envconfig/config_test.go @@ -6,23 +6,8 @@ import ( "time" "github.com/google/go-cmp/cmp" - "github.com/stretchr/testify/require" ) -func TestSmoke(t *testing.T) { - t.Setenv("OLLAMA_DEBUG", "") - require.False(t, Debug()) - - t.Setenv("OLLAMA_DEBUG", "false") - require.False(t, Debug()) - - t.Setenv("OLLAMA_DEBUG", "1") - require.True(t, Debug()) - - t.Setenv("OLLAMA_FLASH_ATTENTION", "1") - require.True(t, FlashAttention()) -} - func TestHost(t *testing.T) { cases := map[string]struct { value string From 85d9d73a7253fce232208a2355113c8ae6d69353 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Mon, 8 Jul 2024 10:34:12 -0700 Subject: [PATCH 17/79] comments --- envconfig/config.go | 50 ++++++++++++++------------ envconfig/config_test.go | 77 +++++++++++++++++++++++++++++++--------- server/sched.go | 4 +-- 3 files changed, 90 insertions(+), 41 deletions(-) diff --git a/envconfig/config.go b/envconfig/config.go index 01abea42..b82b773d 100644 --- a/envconfig/config.go +++ b/envconfig/config.go @@ -1,7 +1,6 @@ package envconfig import ( - "errors" "fmt" "log/slog" "math" @@ -15,15 +14,12 @@ import ( "time" ) -var ErrInvalidHostPort = errors.New("invalid port specified in OLLAMA_HOST") - // Host returns the scheme and host. Host can be configured via the OLLAMA_HOST environment variable. // Default is scheme "http" and host "127.0.0.1:11434" func Host() *url.URL { defaultPort := "11434" - s := os.Getenv("OLLAMA_HOST") - s = strings.TrimSpace(strings.Trim(strings.TrimSpace(s), "\"'")) + s := strings.TrimSpace(Var("OLLAMA_HOST")) scheme, hostport, ok := strings.Cut(s, "://") switch { case !ok: @@ -48,6 +44,7 @@ func Host() *url.URL { } if n, err := strconv.ParseInt(port, 10, 32); err != nil || n > 65535 || n < 0 { + slog.Warn("invalid port, using default", "port", port, "default", defaultPort) return &url.URL{ Scheme: scheme, Host: net.JoinHostPort(host, defaultPort), @@ -62,7 +59,7 @@ func Host() *url.URL { // Origins returns a list of allowed origins. Origins can be configured via the OLLAMA_ORIGINS environment variable. func Origins() (origins []string) { - if s := getenv("OLLAMA_ORIGINS"); s != "" { + if s := Var("OLLAMA_ORIGINS"); s != "" { origins = strings.Split(s, ",") } @@ -87,7 +84,7 @@ func Origins() (origins []string) { // Models returns the path to the models directory. Models directory can be configured via the OLLAMA_MODELS environment variable. // Default is $HOME/.ollama/models func Models() string { - if s, ok := os.LookupEnv("OLLAMA_MODELS"); ok { + if s := Var("OLLAMA_MODELS"); s != "" { return s } @@ -104,7 +101,7 @@ func Models() string { // Default is 5 minutes. 
func KeepAlive() (keepAlive time.Duration) { keepAlive = 5 * time.Minute - if s := os.Getenv("OLLAMA_KEEP_ALIVE"); s != "" { + if s := Var("OLLAMA_KEEP_ALIVE"); s != "" { if d, err := time.ParseDuration(s); err == nil { keepAlive = d } else if n, err := strconv.ParseInt(s, 10, 64); err == nil { @@ -121,7 +118,7 @@ func KeepAlive() (keepAlive time.Duration) { func Bool(k string) func() bool { return func() bool { - if s := getenv(k); s != "" { + if s := Var(k); s != "" { b, err := strconv.ParseBool(s) if err != nil { return true @@ -151,7 +148,7 @@ var ( func String(s string) func() string { return func() string { - return getenv(s) + return Var(s) } } @@ -167,7 +164,7 @@ var ( ) func RunnersDir() (p string) { - if p := getenv("OLLAMA_RUNNERS_DIR"); p != "" { + if p := Var("OLLAMA_RUNNERS_DIR"); p != "" { return p } @@ -213,22 +210,29 @@ func RunnersDir() (p string) { return p } -func Int(k string, n int) func() int { - return func() int { - if s := getenv(k); s != "" { - if n, err := strconv.ParseInt(s, 10, 64); err == nil && n >= 0 { - return int(n) +func Uint(key string, defaultValue uint) func() uint { + return func() uint { + if s := Var(key); s != "" { + if n, err := strconv.ParseUint(s, 10, 64); err != nil { + slog.Warn("invalid environment variable, using default", "key", key, "value", s, "default", defaultValue) + } else { + return uint(n) } } - return n + return defaultValue } } var ( - NumParallel = Int("OLLAMA_NUM_PARALLEL", 0) - MaxRunners = Int("OLLAMA_MAX_LOADED_MODELS", 0) - MaxQueue = Int("OLLAMA_MAX_QUEUE", 512) + // NumParallel sets the number of parallel model requests. NumParallel can be configured via the OLLAMA_NUM_PARALLEL environment variable. + NumParallel = Uint("OLLAMA_NUM_PARALLEL", 0) + // MaxRunners sets the maximum number of loaded models. MaxRunners can be configured via the OLLAMA_MAX_LOADED_MODELS environment variable. + MaxRunners = Uint("OLLAMA_MAX_LOADED_MODELS", 0) + // MaxQueue sets the maximum number of queued requests. MaxQueue can be configured via the OLLAMA_MAX_QUEUE environment variable. + MaxQueue = Uint("OLLAMA_MAX_QUEUE", 512) + // MaxVRAM sets a maximum VRAM override in bytes. MaxVRAM can be configured via the OLLAMA_MAX_VRAM environment variable. 
+ MaxVRAM = Uint("OLLAMA_MAX_VRAM", 0) ) type EnvVar struct { @@ -274,7 +278,7 @@ func Values() map[string]string { return vals } -// getenv returns an environment variable stripped of leading and trailing quotes or spaces -func getenv(key string) string { - return strings.Trim(os.Getenv(key), "\"' ") +// Var returns an environment variable stripped of leading and trailing quotes or spaces +func Var(key string) string { + return strings.Trim(strings.TrimSpace(os.Getenv(key)), "\"'") } diff --git a/envconfig/config_test.go b/envconfig/config_test.go index 977298aa..92a500f1 100644 --- a/envconfig/config_test.go +++ b/envconfig/config_test.go @@ -30,6 +30,10 @@ func TestHost(t *testing.T) { "extra quotes": {"\"1.2.3.4\"", "1.2.3.4:11434"}, "extra space+quotes": {" \" 1.2.3.4 \" ", "1.2.3.4:11434"}, "extra single quotes": {"'1.2.3.4'", "1.2.3.4:11434"}, + "http": {"http://1.2.3.4", "1.2.3.4:80"}, + "http port": {"http://1.2.3.4:4321", "1.2.3.4:4321"}, + "https": {"https://1.2.3.4", "1.2.3.4:443"}, + "https port": {"https://1.2.3.4:4321", "1.2.3.4:4321"}, } for name, tt := range cases { @@ -133,24 +137,45 @@ func TestOrigins(t *testing.T) { } func TestBool(t *testing.T) { - cases := map[string]struct { - value string - expect bool - }{ - "empty": {"", false}, - "true": {"true", true}, - "false": {"false", false}, - "1": {"1", true}, - "0": {"0", false}, - "random": {"random", true}, - "something": {"something", true}, + cases := map[string]bool{ + "": false, + "true": true, + "false": false, + "1": true, + "0": false, + // invalid values + "random": true, + "something": true, } - for name, tt := range cases { - t.Run(name, func(t *testing.T) { - t.Setenv("OLLAMA_BOOL", tt.value) - if b := Bool("OLLAMA_BOOL"); b() != tt.expect { - t.Errorf("%s: expected %t, got %t", name, tt.expect, b()) + for k, v := range cases { + t.Run(k, func(t *testing.T) { + t.Setenv("OLLAMA_BOOL", k) + if b := Bool("OLLAMA_BOOL")(); b != v { + t.Errorf("%s: expected %t, got %t", k, v, b) + } + }) + } +} + +func TestUint(t *testing.T) { + cases := map[string]uint{ + "0": 0, + "1": 1, + "1337": 1337, + // default values + "": 11434, + "-1": 11434, + "0o10": 11434, + "0x10": 11434, + "string": 11434, + } + + for k, v := range cases { + t.Run(k, func(t *testing.T) { + t.Setenv("OLLAMA_UINT", k) + if i := Uint("OLLAMA_UINT", 11434)(); i != v { + t.Errorf("%s: expected %d, got %d", k, v, i) } }) } @@ -188,3 +213,23 @@ func TestKeepAlive(t *testing.T) { }) } } + +func TestVar(t *testing.T) { + cases := map[string]string{ + "value": "value", + " value ": "value", + " 'value' ": "value", + ` "value" `: "value", + " ' value ' ": " value ", + ` " value " `: " value ", + } + + for k, v := range cases { + t.Run(k, func(t *testing.T) { + t.Setenv("OLLAMA_VAR", k) + if s := Var("OLLAMA_VAR"); s != v { + t.Errorf("%s: expected %q, got %q", k, v, s) + } + }) + } +} diff --git a/server/sched.go b/server/sched.go index 610a2c50..ce2945d8 100644 --- a/server/sched.go +++ b/server/sched.go @@ -129,7 +129,7 @@ func (s *Scheduler) processPending(ctx context.Context) { slog.Debug("pending request cancelled or timed out, skipping scheduling") continue } - numParallel := envconfig.NumParallel() + numParallel := int(envconfig.NumParallel()) // TODO (jmorganca): multimodal models don't support parallel yet // see https://github.com/ollama/ollama/issues/4165 if len(pending.model.ProjectorPaths) > 0 && numParallel != 1 { @@ -151,7 +151,7 @@ func (s *Scheduler) processPending(ctx context.Context) { pending.useLoadedRunner(runner, s.finishedReqCh) 
break } - } else if envconfig.MaxRunners() > 0 && loadedCount >= envconfig.MaxRunners() { + } else if envconfig.MaxRunners() > 0 && loadedCount >= int(envconfig.MaxRunners()) { slog.Debug("max runners achieved, unloading one to make room", "runner_count", loadedCount) runnerToExpire = s.findRunnerToUnload() } else { From d835368eb8599b4f4c2f8a766bad5b57498a988d Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Mon, 22 Jul 2024 16:16:22 -0400 Subject: [PATCH 18/79] convert: capture `head_dim` for mistral (#5818) --- convert/mistral.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/convert/mistral.go b/convert/mistral.go index da6874cf..8fe066d6 100644 --- a/convert/mistral.go +++ b/convert/mistral.go @@ -71,6 +71,11 @@ func (m *MistralModel) WriteGGUF(ws io.WriteSeeker) error { "tokenizer.ggml.unknown_token_id": uint32(0), } + if m.Params.HeadDimension > 0 { + kv["llama.attention.key_length"] = uint32(m.Params.HeadDimension) + kv["llama.attention.value_length"] = uint32(m.Params.HeadDimension) + } + return llm.NewGGUFV3(m.Params.ByteOrder).Encode(ws, kv, m.Tensors) } From c0648233f2236f82f6830d2aaed552ae0f72379b Mon Sep 17 00:00:00 2001 From: royjhan <65097070+royjhan@users.noreply.github.com> Date: Mon, 22 Jul 2024 13:37:08 -0700 Subject: [PATCH 19/79] api embed docs (#5282) --- docs/api.md | 84 ++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 76 insertions(+), 8 deletions(-) diff --git a/docs/api.md b/docs/api.md index c577bb1a..4381c376 100644 --- a/docs/api.md +++ b/docs/api.md @@ -1026,7 +1026,7 @@ If `stream` is set to `false`, then the response is a single JSON object: ## Generate Embeddings ```shell -POST /api/embeddings +POST /api/embed ``` Generate embeddings from a model @@ -1034,10 +1034,11 @@ Generate embeddings from a model ### Parameters - `model`: name of model to generate embeddings from -- `prompt`: text to generate embeddings for +- `input`: text or list of text to generate embeddings for Advanced parameters: +- `truncate`: truncates the end of each input to fit within context length. Returns error if `false` and context length is exceeded. Defaults to `true` - `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature` - `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`) @@ -1046,9 +1047,9 @@ Advanced parameters: #### Request ```shell -curl http://localhost:11434/api/embeddings -d '{ +curl http://localhost:11434/api/embed -d '{ "model": "all-minilm", - "prompt": "Here is an article about llamas..." + "input": "Why is the sky blue?" 
}' ``` @@ -1056,10 +1057,35 @@ curl http://localhost:11434/api/embeddings -d '{ ```json { - "embedding": [ - 0.5670403838157654, 0.009260174818336964, 0.23178744316101074, -0.2916173040866852, -0.8924556970596313, - 0.8785552978515625, -0.34576427936553955, 0.5742510557174683, -0.04222835972905159, -0.137906014919281 - ] + "model": "all-minilm", + "embeddings": [[ + 0.010071029, -0.0017594862, 0.05007221, 0.04692972, 0.054916814, + 0.008599704, 0.105441414, -0.025878139, 0.12958129, 0.031952348 + ]] +} +``` + +#### Request (Multiple input) + +```shell +curl http://localhost:11434/api/embed -d '{ + "model": "all-minilm", + "input": ["Why is the sky blue?", "Why is the grass green?"] +}' +``` + +#### Response + +```json +{ + "model": "all-minilm", + "embeddings": [[ + 0.010071029, -0.0017594862, 0.05007221, 0.04692972, 0.054916814, + 0.008599704, 0.105441414, -0.025878139, 0.12958129, 0.031952348 + ],[ + -0.0098027075, 0.06042469, 0.025257962, -0.006364387, 0.07272725, + 0.017194884, 0.09032035, -0.051705178, 0.09951512, 0.09072481 + ]] } ``` @@ -1106,3 +1132,45 @@ A single JSON object will be returned. ] } ``` + +## Generate Embedding + +> Note: this endpoint has been superseded by `/api/embed` + +```shell +POST /api/embeddings +``` + +Generate embeddings from a model + +### Parameters + +- `model`: name of model to generate embeddings from +- `prompt`: text to generate embeddings for + +Advanced parameters: + +- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature` +- `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`) + +### Examples + +#### Request + +```shell +curl http://localhost:11434/api/embeddings -d '{ + "model": "all-minilm", + "prompt": "Here is an article about llamas..." +}' +``` + +#### Response + +```json +{ + "embedding": [ + 0.5670403838157654, 0.009260174818336964, 0.23178744316101074, -0.2916173040866852, -0.8924556970596313, + 0.8785552978515625, -0.34576427936553955, 0.5742510557174683, -0.04222835972905159, -0.137906014919281 + ] +} +``` \ No newline at end of file From 83a0cb8d88561b4302baa8b6ea0623c426483e5d Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Tue, 2 Jul 2024 14:52:18 -0700 Subject: [PATCH 20/79] docs --- docs/template.md | 173 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 173 insertions(+) create mode 100644 docs/template.md diff --git a/docs/template.md b/docs/template.md new file mode 100644 index 00000000..8f41e8fb --- /dev/null +++ b/docs/template.md @@ -0,0 +1,173 @@ +# Template + +Ollama provides a powerful templating engine backed by Go's built-in templating engine to construct prompts for your large language model. This feature is a valuable tool to get the most out of your models. + +## Basic Template Structure + +A basic Go template consists of three main parts: + +* **Layout**: The overall structure of the template. +* **Variables**: Placeholders for dynamic data that will be replaced with actual values when the template is rendered. +* **Functions**: Custom functions or logic that can be used to manipulate the template's content. 
+ +Here's an example of a simple chat template: + +```gotmpl +{{- range .Messages }} +{{ .Role }}: {{ .Content }} +{{- end }} +``` + +In this example, we have: + +* A basic messages structure (layout) +* Three variables: `Messages`, `Role`, and `Content` (variables) +* A custom function (action) that iterates over an array of items (`range .Messages`) and displays each item + +## Adding Templates to Your Model + +By default, models imported into Ollama have a default template of `{{ .Prompt }}`, i.e. user inputs are sent verbatim to the LLM. This is appropriate for text or code completion models but lacks essential markers for chat or instruction models. + +Omitting a template in these models puts the responsibility of correctly templating input onto the user. Adding a template allows users to easily get the best results from the model. + +To add templates in your model, you'll need to add a `TEMPLATE` command to the Modelfile. Here's an example using Meta's Llama 3. + +```dockerfile +FROM llama3 + +TEMPLATE """{{- if .System }}<|start_header_id|>system<|end_header_id|> + +{{ .System }}<|eot_id|> +{{- end }} +{{- range .Messages }}<|start_header_id|>{{ .Role }}<|end_header_id|> + +{{ .Content }}<|eot_id|> +{{- end }}<|start_header_id|>assistant<|end_header_id|> + +""" +``` + +## Variables + +`System` (string): system prompt + +`Prompt` (string): user prompt + +`Response` (string): assistant response + +`Suffix` (string): text inserted after the assistant's response + +`Messages` (list): list of messages + +`Messages[].Role` (string): role which can be one of `system`, `user`, `assistant`, or `tool` + +`Messages[].Content` (string): message content + +`Messages[].ToolCalls` (list): list of tools the model wants to call + +`Messages[].ToolCalls[].Function` (object): function to call + +`Messages[].ToolCalls[].Function.Name` (string): function name + +`Messages[].ToolCalls[].Function.Arguments` (map): mapping of argument name to argument value + +`Tools` (list): list of tools the model can access + +`Tools[].Type` (string): schema type. `type` is always `function` + +`Tools[].Function` (object): function definition + +`Tools[].Function.Name` (string): function name + +`Tools[].Function.Description` (string): function description + +`Tools[].Function.Parameters` (object): function parameters + +`Tools[].Function.Parameters.Type` (string): schema type. `type` is always `object` + +`Tools[].Function.Parameters.Required` (list): list of required properties + +`Tools[].Function.Parameters.Properties` (map): mapping of property name to property definition + +`Tools[].Function.Parameters.Properties[].Type` (string): property type + +`Tools[].Function.Parameters.Properties[].Description` (string): property description + +`Tools[].Function.Parameters.Properties[].Enum` (list): list of valid values + +## Tips and Best Practices + +Keep the following tips and best practices in mind when working with Go templates: + +* **Be mindful of dot**: Control flow structures like `range` and `with` changes the value `.` +* **Out-of-scope variables**: Use `$.` to reference variables not currently in scope, starting from the root +* **Whitespace control**: Use `-` to trim leading (`{{-`) and trailing (`-}}`) whitespace + +## Examples + +### Example Messages + +#### ChatML + +ChatML is a popular template format. It can be used for models such as Databrick's DBRX, Intel's Neural Chat, and Microsoft's Orca 2. 
+ +```gotmpl +{{- if .System }}<|im_start|>system +{{ .System }}<|im_end|> +{{ end }} +{{- range .Messages }}<|im_start|>{{ .Role }} +{{ .Content }}<|im_end|> +{{ end }}<|im_start|>assistant +{{ else }} +{{ if .System }}<|im_start|>system +{{ .System }}<|im_end|> +``` + +### Example Tools + +Tools support can be added to a model by adding a `{{ .Tools }}` node to the template. This feature is useful for models trained to call external tools and can a powerful tool for retrieving real-time data or performing complex tasks. + +#### Mistral + +Mistral v0.3 and Mixtral 8x22B supports tool calling. + +```gotmpl +{{- range $index, $_ := .Messages }} +{{- if eq .Role "user" }} +{{- if and (le (len (slice $.Messages $index)) 2) $.Tools }}[AVAILABLE_TOOLS] {{ json $.Tools }}[/AVAILABLE_TOOLS] +{{- end }}[INST] {{ if and (eq (len (slice $.Messages $index)) 1) $.System }}{{ $.System }} + +{{ end }}{{ .Content }}[/INST] +{{- else if eq .Role "assistant" }} +{{- if .Content }} {{ .Content }} +{{- else if .ToolCalls }}[TOOL_CALLS] [ +{{- range .ToolCalls }}{"name": "{{ .Function.Name }}", "arguments": {{ json .Function.Arguments }}} +{{- end }}] +{{- end }} +{{- else if eq .Role "tool" }}[TOOL_RESULTS] {"content": {{ .Content }}}[/TOOL_RESULTS] +{{- end }} +{{- end }} +``` + +### Example Fill-in-Middle + +Fill-in-middle support can be added to a model by adding a `{{ .Suffix }}` node to the template. This feature is useful for models that are trained to generate text in the middle of user input, such as code completion models. + +#### CodeLlama + +CodeLlama [7B](https://ollama.com/library/codellama:7b-code) and [13B](https://ollama.com/library/codellama:13b-code) code completion models support fill-in-middle. + +```gotmpl +
+<PRE> {{ .Prompt }} <SUF>{{ .Suffix }} <MID>
+```
+
+> [!NOTE]
+> CodeLlama 34B and 70B code completion and all instruct and Python fine-tuned models do not support fill-in-middle.
+
+#### Codestral
+
+Codestral [22B](https://ollama.com/library/codestral:22b) supports fill-in-middle.
+
+```gotmpl
+[SUFFIX]{{ .Suffix }}[PREFIX] {{ .Prompt }}
+```

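As a rough illustration of how the template structure documented above behaves under Go's own text/template engine, here is a small standalone sketch. The struct and field names are made up for the example; they are not Ollama's internal types.

```go
package main

import (
	"os"
	"text/template"
)

// Message mirrors the Role/Content fields described in the docs above.
type Message struct {
	Role    string
	Content string
}

// The "basic chat template" from the documentation, verbatim.
const chatTemplate = `{{- range .Messages }}
{{ .Role }}: {{ .Content }}
{{- end }}
`

func main() {
	tmpl := template.Must(template.New("chat").Parse(chatTemplate))

	data := struct{ Messages []Message }{
		Messages: []Message{
			{Role: "system", Content: "You are a helpful assistant."},
			{Role: "user", Content: "Why is the sky blue?"},
		},
	}

	// Prints one "role: content" line per message, e.g.
	//   system: You are a helpful assistant.
	//   user: Why is the sky blue?
	if err := tmpl.Execute(os.Stdout, data); err != nil {
		panic(err)
	}
}
```
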
From 9b60a038e5169c4a69bc513ae6e7ea1816f9fc11 Mon Sep 17 00:00:00 2001
From: Michael Yang 
Date: Mon, 22 Jul 2024 13:34:56 -0700
Subject: [PATCH 21/79] update api.md

---
 README.md         |   3 +-
 docs/api.md       | 117 +++++++++++++++++++++++++++++++++++++++++++++-
 docs/modelfile.md |   3 +-
 3 files changed, 119 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index b96f4c16..02ab7051 100644
--- a/README.md
+++ b/README.md
@@ -64,7 +64,8 @@ Here are some example models that can be downloaded:
 | LLaVA              | 7B         | 4.5GB | `ollama run llava`             |
 | Solar              | 10.7B      | 6.1GB | `ollama run solar`             |
 
-> Note: You should have at least 8 GB of RAM available to run the 7B models, 16 GB to run the 13B models, and 32 GB to run the 33B models.
+> [!NOTE]
+> You should have at least 8 GB of RAM available to run the 7B models, 16 GB to run the 13B models, and 32 GB to run the 33B models.
 
 ## Customize a model
 
diff --git a/docs/api.md b/docs/api.md
index c577bb1a..bf4c8ce8 100644
--- a/docs/api.md
+++ b/docs/api.md
@@ -40,6 +40,7 @@ Generate a response for a given prompt with a provided model. This is a streamin
 
 - `model`: (required) the [model name](#model-names)
 - `prompt`: the prompt to generate a response for
+- `suffix`: the text after the model response
 - `images`: (optional) a list of base64-encoded images (for multimodal models such as `llava`)
 
 Advanced parameters (optional):
@@ -57,7 +58,8 @@ Advanced parameters (optional):
 
 Enable JSON mode by setting the `format` parameter to `json`. This will structure the response as a valid JSON object. See the JSON mode [example](#request-json-mode) below.
 
-> Note: it's important to instruct the model to use JSON in the `prompt`. Otherwise, the model may generate large amounts whitespace.
+> [!IMPORTANT]
+> It's important to instruct the model to use JSON in the `prompt`. Otherwise, the model may generate large amounts of whitespace.
 
 ### Examples
 
@@ -148,8 +150,44 @@ If `stream` is set to `false`, the response will be a single JSON object:
 }
 ```
 
+#### Request (with suffix)
+
+##### Request
+
+```shell
+curl http://localhost:11434/api/generate -d '{
+  "model": "codellama:code",
+  "prompt": "def compute_gcd(a, b):",
+  "suffix": "    return result",
+  "options": {
+    "temperature": 0
+  },
+  "stream": false
+}'
+```
+
+##### Response
+
+```json
+{
+  "model": "codellama:code",
+  "created_at": "2024-07-22T20:47:51.147561Z",
+  "response": "\n  if a == 0:\n    return b\n  else:\n    return compute_gcd(b % a, a)\n\ndef compute_lcm(a, b):\n  result = (a * b) / compute_gcd(a, b)\n",
+  "done": true,
+  "done_reason": "stop",
+  "context": [...],
+  "total_duration": 1162761250,
+  "load_duration": 6683708,
+  "prompt_eval_count": 17,
+  "prompt_eval_duration": 201222000,
+  "eval_count": 63,
+  "eval_duration": 953997000
+}
+```
+
 #### Request (JSON mode)
 
+> [!IMPORTANT]
 > When `format` is set to `json`, the output will always be a well-formed JSON object. It's important to also instruct the model to respond in JSON.
 
 ##### Request
@@ -383,9 +421,10 @@ Generate the next message in a chat with a provided model. This is a streaming e
 
 The `message` object has the following fields:
 
-- `role`: the role of the message, either `system`, `user` or `assistant`
+- `role`: the role of the message, either `system`, `user`, `assistant`, or `tool`
 - `content`: the content of the message
 - `images` (optional): a list of images to include in the message (for multimodal models such as `llava`)
+- `tool_calls` (optional): a list of tools the model wants to use
 
 Advanced parameters (optional):
 
@@ -393,6 +432,7 @@ Advanced parameters (optional):
 - `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
 - `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects
 - `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)
+- `tools`: external tools the model can use. Not all models support this feature.
 
 ### Examples
 
@@ -622,6 +662,79 @@ curl http://localhost:11434/api/chat -d '{
 }
 ```
 
+#### Chat request (with tools)
+
+##### Request
+
+```shell
+curl http://localhost:11434/api/chat -d '{
+  "model": "mistral",
+  "messages": [
+    {
+      "role": "user",
+      "content": "What is the weather today in Paris?"
+    }
+  ],
+  "stream": false,
+  "tools": [
+    {
+      "type": "function",
+      "function": {
+        "name": "get_current_weather",
+        "description": "Get the current weather for a location",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "location": {
+              "type": "string",
+              "description": "The location to get the weather for, e.g. San Francisco, CA"
+            },
+            "format": {
+              "type": "string",
+              "description": "The format to return the weather in, e.g. 'celsius' or 'fahrenheit'",
+              "enum": ["celsius", "fahrenheit"]
+            }
+          },
+          "required": ["location", "format"]
+        }
+      }
+    }
+  ]
+}'
+```
+
+##### Response
+
+```json
+{
+  "model": "mistral:7b-instruct-v0.3-q4_K_M",
+  "created_at": "2024-07-22T20:33:28.123648Z",
+  "message": {
+    "role": "assistant",
+    "content": "",
+    "tool_calls": [
+      {
+        "function": {
+          "name": "get_current_weather",
+          "arguments": {
+            "format": "celsius",
+            "location": "Paris, FR"
+          }
+        }
+      }
+    ]
+  },
+  "done_reason": "stop",
+  "done": true,
+  "total_duration": 885095291,
+  "load_duration": 3753500,
+  "prompt_eval_count": 122,
+  "prompt_eval_duration": 328493000,
+  "eval_count": 33,
+  "eval_duration": 552222000
+}
+```
+
 ## Create a Model
 
 ```shell
diff --git a/docs/modelfile.md b/docs/modelfile.md
index 21ee1826..c3645b06 100644
--- a/docs/modelfile.md
+++ b/docs/modelfile.md
@@ -1,6 +1,7 @@
 # Ollama Model File
 
-> Note: `Modelfile` syntax is in development
+> [!NOTE]
+> `Modelfile` syntax is in development
 
 A model file is the blueprint to create and share models with Ollama.
 

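For reference, the tool-calling request documented above can also be issued from Go. This is only a sketch using the standard library; the response struct declares just the fields read here, not the full API schema.

```go
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
)

// chatResponse covers only the parts of the /api/chat reply used below.
type chatResponse struct {
	Message struct {
		Role      string `json:"role"`
		Content   string `json:"content"`
		ToolCalls []struct {
			Function struct {
				Name      string         `json:"name"`
				Arguments map[string]any `json:"arguments"`
			} `json:"function"`
		} `json:"tool_calls"`
	} `json:"message"`
}

func main() {
	body := map[string]any{
		"model":  "mistral",
		"stream": false,
		"messages": []map[string]string{
			{"role": "user", "content": "What is the weather today in Paris?"},
		},
		"tools": []map[string]any{{
			"type": "function",
			"function": map[string]any{
				"name":        "get_current_weather",
				"description": "Get the current weather for a location",
				"parameters": map[string]any{
					"type": "object",
					"properties": map[string]any{
						"location": map[string]string{"type": "string"},
					},
					"required": []string{"location"},
				},
			},
		}},
	}

	b, err := json.Marshal(body)
	if err != nil {
		panic(err)
	}

	resp, err := http.Post("http://localhost:11434/api/chat", "application/json", bytes.NewReader(b))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	var out chatResponse
	if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
		panic(err)
	}

	// If the model decided to call a tool, its name and arguments are here.
	for _, tc := range out.Message.ToolCalls {
		fmt.Println(tc.Function.Name, tc.Function.Arguments)
	}
}
```
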
From e12fff8810e37bfabe4416f7f41902387ff3aae1 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen 
Date: Mon, 15 Jul 2024 09:25:56 -0700
Subject: [PATCH 22/79] Enable windows error dialog for subprocess startup

Make sure if something goes wrong spawning the process, the user gets
enough info to be able to try to self-correct, or at least file a bug
with details so we can fix it.  Once the process starts, we immediately
change back to the recommended setting to prevent the blocking dialog.
This ensures if the model fails to load (OOM, unsupported model type,
etc.) the process will exit quickly and we can scan the stdout/stderr
of the subprocess for the reason to report via API.
---
 llm/ext_server/server.cpp |  4 ++++
 llm/llm_darwin_amd64.go   |  3 +++
 llm/llm_darwin_arm64.go   |  3 +++
 llm/llm_linux.go          |  7 ++++++-
 llm/llm_windows.go        | 16 +++++++++++++++-
 llm/server.go             |  1 +
 6 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/llm/ext_server/server.cpp b/llm/ext_server/server.cpp
index e8a076c4..14d921c0 100644
--- a/llm/ext_server/server.cpp
+++ b/llm/ext_server/server.cpp
@@ -41,6 +41,7 @@
 
 #if defined(_WIN32)
 #include 
+#include 
 #endif
 
 #include 
@@ -2737,6 +2738,9 @@ int wmain(int argc, wchar_t **wargv) {
     for (int i = 0; i < argc; ++i) {
         argv[i] = wchar_to_char(wargv[i]);
     }
+
+    // Adjust error mode to avoid error dialog after we start.
+    SetErrorMode(SEM_FAILCRITICALERRORS);
 #else
 int main(int argc, char **argv) {
 #endif
diff --git a/llm/llm_darwin_amd64.go b/llm/llm_darwin_amd64.go
index 3093e1ad..60eed719 100644
--- a/llm/llm_darwin_amd64.go
+++ b/llm/llm_darwin_amd64.go
@@ -2,7 +2,10 @@ package llm
 
 import (
 	"embed"
+	"syscall"
 )
 
 //go:embed build/darwin/x86_64/*/bin/*
 var libEmbed embed.FS
+
+var LlamaServerSysProcAttr = &syscall.SysProcAttr{}
diff --git a/llm/llm_darwin_arm64.go b/llm/llm_darwin_arm64.go
index 928f0b82..20ce8552 100644
--- a/llm/llm_darwin_arm64.go
+++ b/llm/llm_darwin_arm64.go
@@ -2,7 +2,10 @@ package llm
 
 import (
 	"embed"
+	"syscall"
 )
 
 //go:embed build/darwin/arm64/*/bin/*
 var libEmbed embed.FS
+
+var LlamaServerSysProcAttr = &syscall.SysProcAttr{}
diff --git a/llm/llm_linux.go b/llm/llm_linux.go
index c2c5c4cb..928b4e79 100644
--- a/llm/llm_linux.go
+++ b/llm/llm_linux.go
@@ -1,6 +1,11 @@
 package llm
 
-import "embed"
+import (
+	"embed"
+	"syscall"
+)
 
 //go:embed build/linux/*/*/bin/*
 var libEmbed embed.FS
+
+var LlamaServerSysProcAttr = &syscall.SysProcAttr{}
diff --git a/llm/llm_windows.go b/llm/llm_windows.go
index e44f4b95..763cccf9 100644
--- a/llm/llm_windows.go
+++ b/llm/llm_windows.go
@@ -1,6 +1,20 @@
 package llm
 
-import "embed"
+import (
+	"embed"
+	"syscall"
+)
 
 // unused on windows
 var libEmbed embed.FS
+
+const CREATE_DEFAULT_ERROR_MODE = 0x04000000
+
+var LlamaServerSysProcAttr = &syscall.SysProcAttr{
+	// Wire up the default error handling logic. If for some reason a DLL is
+	// missing in the path this will pop up a GUI Dialog explaining the fault so
+	// the user can either fix their PATH, or report a bug. Without this
+	// setting, the process exits immediately with a generic exit status but no
+	// way to (easily) figure out what the actual missing DLL was.
+	CreationFlags: CREATE_DEFAULT_ERROR_MODE,
+}
diff --git a/llm/server.go b/llm/server.go
index 08463ef0..55732773 100644
--- a/llm/server.go
+++ b/llm/server.go
@@ -346,6 +346,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		s.cmd.Env = os.Environ()
 		s.cmd.Stdout = os.Stdout
 		s.cmd.Stderr = s.status
+		s.cmd.SysProcAttr = LlamaServerSysProcAttr
 
 		envWorkarounds := [][2]string{}
 		for _, gpu := range gpus {

From db0968f30c895b9f2059da48800018739ef9bca7 Mon Sep 17 00:00:00 2001
From: Josh <76125168+joshyan1@users.noreply.github.com>
Date: Mon, 22 Jul 2024 15:48:15 -0700
Subject: [PATCH 23/79] fix dupe err message (#5857)

---
 server/routes.go | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/server/routes.go b/server/routes.go
index 0d7ca003..e6ffe526 100644
--- a/server/routes.go
+++ b/server/routes.go
@@ -609,10 +609,9 @@ func (s *Server) CreateModelHandler(c *gin.Context) {
 		defer cancel()
 
 		quantization := cmp.Or(r.Quantize, r.Quantization)
-		if err := CreateModel(ctx, name, filepath.Dir(r.Path), strings.ToUpper(quantization), f, fn); err != nil {
-			if errors.Is(err, errBadTemplate) {
-				ch <- gin.H{"error": err.Error(), "status": http.StatusBadRequest}
-			}
+		if err := CreateModel(ctx, name, filepath.Dir(r.Path), strings.ToUpper(quantization), f, fn); errors.Is(err, errBadTemplate) {
+			ch <- gin.H{"error": err.Error(), "status": http.StatusBadRequest}
+		} else if err != nil {
 			ch <- gin.H{"error": err.Error()}
 		}
 	}()

From 5d604eec5bbaba840fcee8cac8574807f3656ea8 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen 
Date: Mon, 22 Jul 2024 16:16:28 -0700
Subject: [PATCH 24/79] Bump Go patch version

---
 .github/workflows/release.yaml | 10 +++++-----
 .github/workflows/test.yaml    | 10 +++++-----
 Dockerfile                     |  2 +-
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
index 5ae630c3..f0c6db5d 100644
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -31,7 +31,7 @@ jobs:
           security set-keychain-settings -lut 3600 build.keychain
       - uses: actions/setup-go@v5
         with:
-          go-version-file: go.mod
+          go-version: "stable"
           cache: true
       - name: Build Darwin
         env:
@@ -87,7 +87,7 @@ jobs:
           write-host "plugin installed"
       - uses: actions/setup-go@v5
         with:
-          go-version-file: go.mod
+          go-version: "stable"
           cache: true
       - run: go get ./...
       - run: |
@@ -141,7 +141,7 @@ jobs:
           write-host "plugin installed"
       - uses: actions/setup-go@v5
         with:
-          go-version-file: go.mod
+          go-version: "stable"
           cache: true
       - name: 'Install ROCm'
         run: |
@@ -218,7 +218,7 @@ jobs:
           write-host "plugin installed"
       - uses: actions/setup-go@v5
         with:
-          go-version-file: go.mod
+          go-version: "stable"
           cache: true
       - name: 'Install CUDA'
         run: |
@@ -306,7 +306,7 @@ jobs:
           write-host "plugin installed"
       - uses: actions/setup-go@v5
         with:
-          go-version-file: go.mod
+          go-version: "stable"
           cache: true
       - run: go get
       - uses: actions/download-artifact@v4
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 90fef6e5..5e002a22 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -63,7 +63,7 @@ jobs:
       - uses: actions/checkout@v4
       - uses: actions/setup-go@v5
         with:
-          go-version-file: go.mod
+          go-version: "stable"
           cache: true
       - run: go get ./...
       - run: |
@@ -163,7 +163,7 @@ jobs:
       - uses: actions/checkout@v4
       - uses: actions/setup-go@v5
         with:
-          go-version-file: go.mod
+          go-version: "stable"
           cache: true
       - name: 'Install ROCm'
         run: |
@@ -200,7 +200,7 @@ jobs:
       - uses: actions/checkout@v4
       - uses: actions/setup-go@v5
         with:
-          go-version-file: go.mod
+          go-version: "stable"
           cache: true
       - name: 'Install CUDA'
         run: |
@@ -255,7 +255,7 @@ jobs:
           submodules: recursive
       - uses: actions/setup-go@v5
         with:
-          go-version-file: go.mod
+          go-version: "stable"
           cache: false
       - run: |
           case ${{ matrix.arch }} in
@@ -297,7 +297,7 @@ jobs:
           submodules: recursive
       - uses: actions/setup-go@v5
         with:
-          go-version-file: go.mod
+          go-version: "stable"
           cache: true
       - run: |
           case ${{ matrix.arch }} in
diff --git a/Dockerfile b/Dockerfile
index ca393496..c8efdd8a 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-ARG GOLANG_VERSION=1.22.1
+ARG GOLANG_VERSION=1.22.5
 ARG CMAKE_VERSION=3.22.1
 # this CUDA_VERSION corresponds with the one specified in docs/gpu.md
 ARG CUDA_VERSION=11.3.1

From a6cd8f6169c029c92105962017562274bd90626b Mon Sep 17 00:00:00 2001
From: Ajay Chintala 
Date: Tue, 23 Jul 2024 11:40:23 -0700
Subject: [PATCH 25/79] Update README.md to add LLMStack integration (#5799)

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index b96f4c16..6a06b819 100644
--- a/README.md
+++ b/README.md
@@ -296,6 +296,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Kerlig AI](https://www.kerlig.com/) (AI writing assistant for macOS)
 - [AI Studio](https://github.com/MindWorkAI/AI-Studio)
 - [Sidellama](https://github.com/gyopak/sidellama) (browser-based LLM client)
+- [LLMStack](https://github.com/trypromptly/LLMStack) (No-code multi-agent framework to build LLM agents and workflows)
 
 ### Terminal
 

From 830fdd271536ee257db72c29c2be5b5629e58389 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen 
Date: Tue, 23 Jul 2024 15:14:28 -0700
Subject: [PATCH 26/79] Better explain multi-gpu behavior

---
 cmd/cmd.go  | 1 +
 docs/faq.md | 6 +++++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/cmd/cmd.go b/cmd/cmd.go
index b761d018..610fddcb 100644
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -1341,6 +1341,7 @@ func NewCLI() *cobra.Command {
 				envVars["OLLAMA_NUM_PARALLEL"],
 				envVars["OLLAMA_NOPRUNE"],
 				envVars["OLLAMA_ORIGINS"],
+				envVars["OLLAMA_SCHED_SPREAD"],
 				envVars["OLLAMA_TMPDIR"],
 				envVars["OLLAMA_FLASH_ATTENTION"],
 				envVars["OLLAMA_LLM_LIBRARY"],
diff --git a/docs/faq.md b/docs/faq.md
index da1848f7..16c80549 100644
--- a/docs/faq.md
+++ b/docs/faq.md
@@ -272,4 +272,8 @@ The following server settings may be used to adjust how Ollama handles concurren
 - `OLLAMA_NUM_PARALLEL` - The maximum number of parallel requests each model will process at the same time.  The default will auto-select either 4 or 1 based on available memory.
 - `OLLAMA_MAX_QUEUE` - The maximum number of requests Ollama will queue when busy before rejecting additional requests. The default is 512
 
-Note: Windows with Radeon GPUs currently default to 1 model maximum due to limitations in ROCm v5.7 for available VRAM reporting.  Once ROCm v6.2 is available, Windows Radeon will follow the defaults above.  You may enable concurrent model loads on Radeon on Windows, but ensure you don't load more models than will fit into your GPUs VRAM.
\ No newline at end of file
+Note: Windows with Radeon GPUs currently default to 1 model maximum due to limitations in ROCm v5.7 for available VRAM reporting.  Once ROCm v6.2 is available, Windows Radeon will follow the defaults above.  You may enable concurrent model loads on Radeon on Windows, but ensure you don't load more models than will fit into your GPUs' VRAM.
+
+## How does Ollama load models on multiple GPUs?
+
+Installing multiple GPUs of the same brand can be a great way to increase your available VRAM to load larger models.  When you load a new model, Ollama evaluates the required VRAM for the model against what is currently available.  If the model will entirely fit on any single GPU, Ollama will load the model on that GPU.  This typically provides the best performance as it reduces the amount of data transferred across the PCI bus during inference.  If the model does not fit entirely on one GPU, then it will be spread across all the available GPUs.
\ No newline at end of file

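The FAQ entry above can be read as a simple placement rule. The following is a minimal sketch of that rule only, assuming a toy view of per-GPU free VRAM; Ollama's actual scheduler accounts for far more than this.

```go
package main

import "fmt"

type gpu struct {
	id       int
	freeVRAM uint64 // bytes
}

// placeModel applies the rule from the FAQ: if the model fits entirely on a
// single GPU, load it there (best PCI-bus locality); otherwise it has to be
// spread across all available GPUs.
func placeModel(required uint64, gpus []gpu) (single *gpu, spread bool) {
	for i := range gpus {
		if gpus[i].freeVRAM >= required {
			return &gpus[i], false
		}
	}
	return nil, true
}

func main() {
	gpus := []gpu{{id: 0, freeVRAM: 24 << 30}, {id: 1, freeVRAM: 24 << 30}}

	if g, spread := placeModel(16<<30, gpus); !spread {
		fmt.Println("16 GiB model: load entirely on GPU", g.id)
	}
	if _, spread := placeModel(40<<30, gpus); spread {
		fmt.Println("40 GiB model: spread across all GPUs")
	}
}
```
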
From ac33aa7d3782887878e6e24fb4a6238356a489a6 Mon Sep 17 00:00:00 2001
From: royjhan <65097070+royjhan@users.noreply.github.com>
Date: Wed, 24 Jul 2024 11:15:46 -0700
Subject: [PATCH 27/79] Fix Embed Test Flakes (#5893)

* float cmp

* increase tolerance
---
 integration/embed_test.go | 59 +++++++++++++++++++++++++++++++++++----
 1 file changed, 54 insertions(+), 5 deletions(-)

diff --git a/integration/embed_test.go b/integration/embed_test.go
index aeafa57b..61b36fa2 100644
--- a/integration/embed_test.go
+++ b/integration/embed_test.go
@@ -4,12 +4,45 @@ package integration
 
 import (
 	"context"
+	"math"
 	"testing"
 	"time"
 
 	"github.com/ollama/ollama/api"
 )
 
+func floatsEqual32(a, b float32) bool {
+	return math.Abs(float64(a-b)) <= 1e-4
+}
+
+func floatsEqual64(a, b float64) bool {
+	return math.Abs(a-b) <= 1e-4
+}
+
+func TestAllMiniLMEmbeddings(t *testing.T) {
+	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
+	defer cancel()
+
+	req := api.EmbeddingRequest{
+		Model:  "all-minilm",
+		Prompt: "why is the sky blue?",
+	}
+
+	res, err := embeddingTestHelper(ctx, t, req)
+
+	if err != nil {
+		t.Fatalf("error: %v", err)
+	}
+
+	if len(res.Embedding) != 384 {
+		t.Fatalf("expected 384 floats, got %d", len(res.Embedding))
+	}
+
+	if !floatsEqual64(res.Embedding[0], 0.06642947345972061) {
+		t.Fatalf("expected 0.06642947345972061, got %.16f", res.Embedding[0])
+	}
+}
+
 func TestAllMiniLMEmbed(t *testing.T) {
 	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
 	defer cancel()
@@ -33,8 +66,8 @@ func TestAllMiniLMEmbed(t *testing.T) {
 		t.Fatalf("expected 384 floats, got %d", len(res.Embeddings[0]))
 	}
 
-	if res.Embeddings[0][0] != 0.010071031 {
-		t.Fatalf("expected 0.010071031, got %f", res.Embeddings[0][0])
+	if !floatsEqual32(res.Embeddings[0][0], 0.010071031) {
+		t.Fatalf("expected 0.010071031, got %.8f", res.Embeddings[0][0])
 	}
 }
 
@@ -61,12 +94,12 @@ func TestAllMiniLMBatchEmbed(t *testing.T) {
 		t.Fatalf("expected 384 floats, got %d", len(res.Embeddings[0]))
 	}
 
-	if res.Embeddings[0][0] != 0.010071031 || res.Embeddings[1][0] != -0.009802706 {
-		t.Fatalf("expected 0.010071031 and -0.009802706, got %f and %f", res.Embeddings[0][0], res.Embeddings[1][0])
+	if !floatsEqual32(res.Embeddings[0][0], 0.010071031) || !floatsEqual32(res.Embeddings[1][0], -0.009802706) {
+		t.Fatalf("expected 0.010071031 and -0.009802706, got %.8f and %.8f", res.Embeddings[0][0], res.Embeddings[1][0])
 	}
 }
 
-func TestAllMiniLmEmbedTruncate(t *testing.T) {
+func TestAllMiniLMEmbedTruncate(t *testing.T) {
 	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
 	defer cancel()
 
@@ -135,6 +168,22 @@ func TestAllMiniLmEmbedTruncate(t *testing.T) {
 	}
 }
 
+func embeddingTestHelper(ctx context.Context, t *testing.T, req api.EmbeddingRequest) (*api.EmbeddingResponse, error) {
+	client, _, cleanup := InitServerConnection(ctx, t)
+	defer cleanup()
+	if err := PullIfMissing(ctx, client, req.Model); err != nil {
+		t.Fatalf("failed to pull model %s: %v", req.Model, err)
+	}
+
+	response, err := client.Embeddings(ctx, &req)
+
+	if err != nil {
+		return nil, err
+	}
+
+	return response, nil
+}
+
 func embedTestHelper(ctx context.Context, t *testing.T, req api.EmbedRequest) (*api.EmbedResponse, error) {
 	client, _, cleanup := InitServerConnection(ctx, t)
 	defer cleanup()

From bb46bbcf5e90e5efab5ff946a6c798131907ba2d Mon Sep 17 00:00:00 2001
From: Michael Yang 
Date: Wed, 24 Jul 2024 13:05:59 -0700
Subject: [PATCH 28/79] llm(llama): pass rope factors (#5924)

---
 llm/patches/0001-llama-3.1-rope-scaling.diff | 71 ++++++++++++++++++++
 1 file changed, 71 insertions(+)
 create mode 100644 llm/patches/0001-llama-3.1-rope-scaling.diff

diff --git a/llm/patches/0001-llama-3.1-rope-scaling.diff b/llm/patches/0001-llama-3.1-rope-scaling.diff
new file mode 100644
index 00000000..45dcb4f5
--- /dev/null
+++ b/llm/patches/0001-llama-3.1-rope-scaling.diff
@@ -0,0 +1,71 @@
+From 2f872f294fb6f5c6e8f983b68c40ea656053dd92 Mon Sep 17 00:00:00 2001
+From: Michael Yang 
+Date: Tue, 23 Jul 2024 14:33:29 -0700
+Subject: [PATCH] llama 3.1 rope scaling
+
+---
+ src/llama.cpp | 14 ++++++++++++--
+ 1 file changed, 12 insertions(+), 2 deletions(-)
+
+diff --git a/src/llama.cpp b/src/llama.cpp
+index 8fe51971..a9969df8 100644
+--- a/src/llama.cpp
++++ b/src/llama.cpp
+@@ -2472,6 +2472,7 @@ struct llama_layer {
+     // long rope factors
+     struct ggml_tensor * rope_long  = nullptr;
+     struct ggml_tensor * rope_short = nullptr;
++    struct ggml_tensor * rope_freqs = nullptr;
+ 
+     // bitnet scale
+     struct ggml_tensor * wq_scale;
+@@ -6143,6 +6144,8 @@ static bool llm_load_tensors(
+ 
+                         layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+ 
++                        layer.rope_freqs  = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS,  "weight"), { n_embd/n_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
++
+                         if (n_expert == 0) {
+                             layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
+                             layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
+@@ -8620,6 +8623,10 @@ struct llm_build_context {
+         // choose long/short freq factors based on the context size
+         const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max;
+ 
++        if (model.layers[il].rope_freqs != nullptr) {
++            return model.layers[il].rope_freqs;
++        }
++
+         if (n_ctx_pre_seq > hparams.n_ctx_orig_yarn) {
+             return model.layers[il].rope_long;
+         }
+@@ -8814,6 +8821,9 @@ struct llm_build_context {
+ 
+             // self-attention
+             {
++                // rope freq factors for llama3; may return nullptr for llama2 and other models
++                struct ggml_tensor * rope_factors = build_rope_factors(il);
++
+                 // compute Q and K and RoPE them
+                 struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                 cb(Qcur, "Qcur", il);
+@@ -8837,14 +8847,14 @@ struct llm_build_context {
+                 }
+ 
+                 Qcur = ggml_rope_ext(
+-                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
++                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
+                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                     ext_factor, attn_factor, beta_fast, beta_slow
+                 );
+                 cb(Qcur, "Qcur", il);
+ 
+                 Kcur = ggml_rope_ext(
+-                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
++                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
+                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                     ext_factor, attn_factor, beta_fast, beta_slow
+                 );
+-- 
+2.45.2
+

From 7c2a157ca4a9188c9d0e0c0a03a6bd9d163ba464 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen 
Date: Wed, 24 Jul 2024 13:43:26 -0700
Subject: [PATCH 29/79] Ensure amd gpu nodes are numerically sorted

For systems that enumerate over 10 CPUs, the default lexicographical
sort order interleaves CPUs and GPUs.
---
 gpu/amd_linux.go | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/gpu/amd_linux.go b/gpu/amd_linux.go
index 15b6fc61..6493af9e 100644
--- a/gpu/amd_linux.go
+++ b/gpu/amd_linux.go
@@ -10,6 +10,7 @@ import (
 	"path/filepath"
 	"regexp"
 	"slices"
+	"sort"
 	"strconv"
 	"strings"
 
@@ -82,6 +83,20 @@ func AMDGetGPUInfo() []RocmGPUInfo {
 	// The amdgpu driver always exposes the host CPU(s) first, but we have to skip them and subtract
 	// from the other IDs to get alignment with the HIP libraries expectations (zero is the first GPU, not the CPU)
 	matches, _ := filepath.Glob(GPUPropertiesFileGlob)
+	sort.Slice(matches, func(i, j int) bool {
+		// /sys/class/kfd/kfd/topology/nodes//properties
+		a, err := strconv.ParseInt(filepath.Base(filepath.Dir(matches[i])), 10, 64)
+		if err != nil {
+			slog.Debug("parse err", "error", err, "match", matches[i])
+			return false
+		}
+		b, err := strconv.ParseInt(filepath.Base(filepath.Dir(matches[j])), 10, 64)
+		if err != nil {
+			slog.Debug("parse err", "error", err, "match", matches[j])
+			return false
+		}
+		return a < b
+	})
 	cpuCount := 0
 	for _, match := range matches {
 		slog.Debug("evaluating amdgpu node " + match)

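The sorting problem this patch describes is easy to reproduce: once node IDs reach double digits, plain string ordering interleaves them. A small sketch with hypothetical kfd node paths:

```go
package main

import (
	"fmt"
	"path/filepath"
	"sort"
	"strconv"
)

func main() {
	nodes := []string{
		"/sys/class/kfd/kfd/topology/nodes/0/properties",
		"/sys/class/kfd/kfd/topology/nodes/2/properties",
		"/sys/class/kfd/kfd/topology/nodes/10/properties",
		"/sys/class/kfd/kfd/topology/nodes/11/properties",
	}

	// Lexicographic order (what a plain string sort, and filepath.Glob's
	// sorted output, gives you): 0, 10, 11, 2. Node 2 now trails nodes 10
	// and 11, which is how CPUs and GPUs end up interleaved.
	sort.Strings(nodes)
	fmt.Println(nodes)

	// Sorting on the numeric node ID, as the patch does, restores 0, 2, 10, 11.
	sort.Slice(nodes, func(i, j int) bool {
		a, _ := strconv.Atoi(filepath.Base(filepath.Dir(nodes[i])))
		b, _ := strconv.Atoi(filepath.Base(filepath.Dir(nodes[j])))
		return a < b
	})
	fmt.Println(nodes)
}
```
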
From 6c2129d5d0692f18e677c48d5ea7e015ecae5015 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen 
Date: Wed, 24 Jul 2024 15:22:00 -0700
Subject: [PATCH 30/79] Explain font problems on windows 10

---
 docs/windows.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/windows.md b/docs/windows.md
index 69c2aa6d..dbfc1440 100644
--- a/docs/windows.md
+++ b/docs/windows.md
@@ -23,6 +23,8 @@ Logs will often be helpful in diagnosing the problem (see
 * NVIDIA 452.39 or newer Drivers if you have an NVIDIA card
 * AMD Radeon Driver https://www.amd.com/en/support if you have a Radeon card
 
+Ollama uses unicode characters for progress indication, which may render as unknown squares in some older terminal fonts in Windows 10. If you see this, try changing your terminal font settings.
+
 ## API Access
 
 Here's a quick example showing API access from `powershell`

From ce3c93b08f0b90496e86b9e0a5753334c2d21419 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen 
Date: Wed, 24 Jul 2024 17:09:20 -0700
Subject: [PATCH 31/79] Report better error on cuda unsupported os/arch

If we detect an NVIDIA GPU, but nvidia doesn't support the os/arch,
this will report a better error for the user and point them to docs
to self-install the drivers if possible.
---
 scripts/install.sh | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/scripts/install.sh b/scripts/install.sh
index 2a06c350..aa8b3e5e 100644
--- a/scripts/install.sh
+++ b/scripts/install.sh
@@ -198,19 +198,29 @@ if check_gpu lspci amdgpu || check_gpu lshw amdgpu; then
     exit 0
 fi
 
+CUDA_REPO_ERR_MSG="NVIDIA GPU detected, but your OS and Architecture are not supported by NVIDIA.  Please install the CUDA driver manually https://docs.nvidia.com/cuda/cuda-installation-guide-linux/"
 # ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#rhel-7-centos-7
 # ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#rhel-8-rocky-8
 # ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#rhel-9-rocky-9
 # ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#fedora
 install_cuda_driver_yum() {
     status 'Installing NVIDIA repository...'
+    
     case $PACKAGE_MANAGER in
         yum)
             $SUDO $PACKAGE_MANAGER -y install yum-utils
-            $SUDO $PACKAGE_MANAGER-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-$1$2.repo
+            if curl -I --silent --fail --location "https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-$1$2.repo" >/dev/null ; then
+                $SUDO $PACKAGE_MANAGER-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-$1$2.repo
+            else
+                error $CUDA_REPO_ERR_MSG
+            fi
             ;;
         dnf)
-            $SUDO $PACKAGE_MANAGER config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-$1$2.repo
+            if curl -I --silent --fail --location "https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-$1$2.repo" >/dev/null ; then
+                $SUDO $PACKAGE_MANAGER config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-$1$2.repo
+            else
+                error $CUDA_REPO_ERR_MSG
+            fi
             ;;
     esac
 
@@ -235,7 +245,11 @@ install_cuda_driver_yum() {
 # ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#debian
 install_cuda_driver_apt() {
     status 'Installing NVIDIA repository...'
-    curl -fsSL -o $TEMP_DIR/cuda-keyring.deb https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-keyring_1.1-1_all.deb
+    if curl -I --silent --fail --location "https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-keyring_1.1-1_all.deb" >/dev/null ; then
+        curl -fsSL -o $TEMP_DIR/cuda-keyring.deb https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-keyring_1.1-1_all.deb
+    else
+        error $CUDA_REPO_ERR_MSG
+    fi
 
     case $1 in
         debian)

From bbf8f102ee06bd6b149e4999571c0844aa47b12f Mon Sep 17 00:00:00 2001
From: Jeffrey Morgan 
Date: Thu, 25 Jul 2024 18:24:55 -0400
Subject: [PATCH 32/79] Revert "llm(llama): pass rope factors (#5924)" (#5963)

This reverts commit bb46bbcf5e90e5efab5ff946a6c798131907ba2d.
---
 llm/patches/0001-llama-3.1-rope-scaling.diff | 71 --------------------
 1 file changed, 71 deletions(-)
 delete mode 100644 llm/patches/0001-llama-3.1-rope-scaling.diff

diff --git a/llm/patches/0001-llama-3.1-rope-scaling.diff b/llm/patches/0001-llama-3.1-rope-scaling.diff
deleted file mode 100644
index 45dcb4f5..00000000
--- a/llm/patches/0001-llama-3.1-rope-scaling.diff
+++ /dev/null
@@ -1,71 +0,0 @@
-From 2f872f294fb6f5c6e8f983b68c40ea656053dd92 Mon Sep 17 00:00:00 2001
-From: Michael Yang 
-Date: Tue, 23 Jul 2024 14:33:29 -0700
-Subject: [PATCH] llama 3.1 rope scaling
-
----
- src/llama.cpp | 14 ++++++++++++--
- 1 file changed, 12 insertions(+), 2 deletions(-)
-
-diff --git a/src/llama.cpp b/src/llama.cpp
-index 8fe51971..a9969df8 100644
---- a/src/llama.cpp
-+++ b/src/llama.cpp
-@@ -2472,6 +2472,7 @@ struct llama_layer {
-     // long rope factors
-     struct ggml_tensor * rope_long  = nullptr;
-     struct ggml_tensor * rope_short = nullptr;
-+    struct ggml_tensor * rope_freqs = nullptr;
- 
-     // bitnet scale
-     struct ggml_tensor * wq_scale;
-@@ -6143,6 +6144,8 @@ static bool llm_load_tensors(
- 
-                         layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
- 
-+                        layer.rope_freqs  = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS,  "weight"), { n_embd/n_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
-+
-                         if (n_expert == 0) {
-                             layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
-                             layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
-@@ -8620,6 +8623,10 @@ struct llm_build_context {
-         // choose long/short freq factors based on the context size
-         const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max;
- 
-+        if (model.layers[il].rope_freqs != nullptr) {
-+            return model.layers[il].rope_freqs;
-+        }
-+
-         if (n_ctx_pre_seq > hparams.n_ctx_orig_yarn) {
-             return model.layers[il].rope_long;
-         }
-@@ -8814,6 +8821,9 @@ struct llm_build_context {
- 
-             // self-attention
-             {
-+                // rope freq factors for llama3; may return nullptr for llama2 and other models
-+                struct ggml_tensor * rope_factors = build_rope_factors(il);
-+
-                 // compute Q and K and RoPE them
-                 struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
-                 cb(Qcur, "Qcur", il);
-@@ -8837,14 +8847,14 @@ struct llm_build_context {
-                 }
- 
-                 Qcur = ggml_rope_ext(
--                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
-+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
-                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                     ext_factor, attn_factor, beta_fast, beta_slow
-                 );
-                 cb(Qcur, "Qcur", il);
- 
-                 Kcur = ggml_rope_ext(
--                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
-+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
-                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                     ext_factor, attn_factor, beta_fast, beta_slow
-                 );
--- 
-2.45.2
-

From 4de1370a9dcc88b79ddc2d4af2e8c954bdfa67a1 Mon Sep 17 00:00:00 2001
From: royjhan <65097070+royjhan@users.noreply.github.com>
Date: Thu, 25 Jul 2024 15:34:06 -0700
Subject: [PATCH 33/79] openai tools doc (#5617)

---
 docs/openai.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/openai.md b/docs/openai.md
index 248ba74a..e51d3194 100644
--- a/docs/openai.md
+++ b/docs/openai.md
@@ -79,7 +79,7 @@ curl http://localhost:11434/v1/chat/completions \
 - [x] JSON mode
 - [x] Reproducible outputs
 - [ ] Vision
-- [ ] Function calling
+- [x] Tools
 - [ ] Logprobs
 
 #### Supported request fields
@@ -97,9 +97,9 @@ curl http://localhost:11434/v1/chat/completions \
 - [x] `temperature`
 - [x] `top_p`
 - [x] `max_tokens`
-- [ ] `logit_bias`
-- [ ] `tools`
+- [x] `tools`
 - [ ] `tool_choice`
+- [ ] `logit_bias`
 - [ ] `user`
 - [ ] `n`
 

From 455e61170d12d2b29ac2dfe5fa6444ae40a9ef7f Mon Sep 17 00:00:00 2001
From: Jeffrey Morgan 
Date: Thu, 25 Jul 2024 18:34:47 -0400
Subject: [PATCH 34/79] Update openai.md

---
 docs/openai.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/openai.md b/docs/openai.md
index e51d3194..04d56bd6 100644
--- a/docs/openai.md
+++ b/docs/openai.md
@@ -78,8 +78,8 @@ curl http://localhost:11434/v1/chat/completions \
 - [x] Streaming
 - [x] JSON mode
 - [x] Reproducible outputs
-- [ ] Vision
 - [x] Tools
+- [ ] Vision
 - [ ] Logprobs
 
 #### Supported request fields

From c8af3c2d969a99618eecf169bd75aa112573ac27 Mon Sep 17 00:00:00 2001
From: Blake Mizerany 
Date: Thu, 25 Jul 2024 15:58:30 -0700
Subject: [PATCH 35/79] server: reuse original download URL for images (#5962)

This changes the registry client to reuse the original download URL
it gets on the first redirect response for all subsequent requests,
preventing thundering herd issues when hot new LLMs are released.
---
 server/download.go | 75 +++++++++++++++++++++++++++++++++++++++++++++-
 server/images.go   |  6 +++-
 2 files changed, 79 insertions(+), 2 deletions(-)

diff --git a/server/download.go b/server/download.go
index d93cd3b4..8b5b577f 100644
--- a/server/download.go
+++ b/server/download.go
@@ -8,6 +8,7 @@ import (
 	"io"
 	"log/slog"
 	"math"
+	"math/rand/v2"
 	"net/http"
 	"net/url"
 	"os"
@@ -141,6 +142,32 @@ func (b *blobDownload) Run(ctx context.Context, requestURL *url.URL, opts *regis
 	b.err = b.run(ctx, requestURL, opts)
 }
 
+func newBackoff(maxBackoff time.Duration) func(ctx context.Context) error {
+	var n int
+	return func(ctx context.Context) error {
+		if ctx.Err() != nil {
+			return ctx.Err()
+		}
+
+		n++
+
+		// n^2 backoff timer is a little smoother than the
+		// common choice of 2^n.
+		d := min(time.Duration(n*n)*10*time.Millisecond, maxBackoff)
+		// Randomize the delay between 0.5-1.5 x msec, in order
+		// to prevent accidental "thundering herd" problems.
+		d = time.Duration(float64(d) * (rand.Float64() + 0.5))
+		t := time.NewTimer(d)
+		defer t.Stop()
+		select {
+		case <-ctx.Done():
+			return ctx.Err()
+		case <-t.C:
+			return nil
+		}
+	}
+}
+
 func (b *blobDownload) run(ctx context.Context, requestURL *url.URL, opts *registryOptions) error {
 	defer blobDownloadManager.Delete(b.Digest)
 	ctx, b.CancelFunc = context.WithCancel(ctx)
@@ -153,6 +180,52 @@ func (b *blobDownload) run(ctx context.Context, requestURL *url.URL, opts *regis
 
 	_ = file.Truncate(b.Total)
 
+	directURL, err := func() (*url.URL, error) {
+		ctx, cancel := context.WithTimeout(ctx, 30*time.Second)
+		defer cancel()
+
+		backoff := newBackoff(10 * time.Second)
+		for {
+			// shallow clone opts to be used in the closure
+			// without affecting the outer opts.
+			newOpts := new(registryOptions)
+			*newOpts = *opts
+
+			newOpts.CheckRedirect = func(req *http.Request, via []*http.Request) error {
+				if len(via) > 10 {
+					return errors.New("maximum redirects exceeded (10) for directURL")
+				}
+
+				// if the hostname is the same, allow the redirect
+				if req.URL.Hostname() == requestURL.Hostname() {
+					return nil
+				}
+
+				// stop at the first redirect that is not
+				// the same hostname as the original
+				// request.
+				return http.ErrUseLastResponse
+			}
+
+			resp, err := makeRequestWithRetry(ctx, http.MethodGet, requestURL, nil, nil, newOpts)
+			if err != nil {
+				slog.Warn("failed to get direct URL; backing off and retrying", "err", err)
+				if err := backoff(ctx); err != nil {
+					return nil, err
+				}
+				continue
+			}
+			defer resp.Body.Close()
+			if resp.StatusCode != http.StatusTemporaryRedirect {
+				return nil, fmt.Errorf("unexpected status code %d", resp.StatusCode)
+			}
+			return resp.Location()
+		}
+	}()
+	if err != nil {
+		return err
+	}
+
 	g, inner := errgroup.WithContext(ctx)
 	g.SetLimit(numDownloadParts)
 	for i := range b.Parts {
@@ -165,7 +238,7 @@ func (b *blobDownload) run(ctx context.Context, requestURL *url.URL, opts *regis
 			var err error
 			for try := 0; try < maxRetries; try++ {
 				w := io.NewOffsetWriter(file, part.StartsAt())
-				err = b.downloadChunk(inner, requestURL, w, part, opts)
+				err = b.downloadChunk(inner, directURL, w, part, opts)
 				switch {
 				case errors.Is(err, context.Canceled), errors.Is(err, syscall.ENOSPC):
 					// return immediately if the context is canceled or the device is out of space
diff --git a/server/images.go b/server/images.go
index 574dec19..836dbcc2 100644
--- a/server/images.go
+++ b/server/images.go
@@ -54,6 +54,8 @@ type registryOptions struct {
 	Username string
 	Password string
 	Token    string
+
+	CheckRedirect func(req *http.Request, via []*http.Request) error
 }
 
 type Model struct {
@@ -1131,7 +1133,9 @@ func makeRequest(ctx context.Context, method string, requestURL *url.URL, header
 		req.ContentLength = contentLength
 	}
 
-	resp, err := http.DefaultClient.Do(req)
+	resp, err := (&http.Client{
+		CheckRedirect: regOpts.CheckRedirect,
+	}).Do(req)
 	if err != nil {
 		return nil, err
 	}

From 997c903884b08aef53d0f92634f74bdb64f05c0a Mon Sep 17 00:00:00 2001
From: Michael Yang 
Date: Thu, 25 Jul 2024 16:23:40 -0700
Subject: [PATCH 36/79] Update docs/template.md

Co-authored-by: Jeffrey Morgan 
---
 docs/template.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/template.md b/docs/template.md
index 8f41e8fb..f6ce06ba 100644
--- a/docs/template.md
+++ b/docs/template.md
@@ -24,7 +24,7 @@ In this example, we have:
 * Three variables: `Messages`, `Role`, and `Content` (variables)
 * A custom function (action) that iterates over an array of items (`range .Messages`) and displays each item
 
-## Adding Templates to Your Model
+## Adding templates to your model
 
 By default, models imported into Ollama have a default template of `{{ .Prompt }}`, i.e. user inputs are sent verbatim to the LLM. This is appropriate for text or code completion models but lacks essential markers for chat or instruction models.
 

From ae27d9dcfd32b7fbaa0d5a1fb0126106873332bf Mon Sep 17 00:00:00 2001
From: Jeffrey Morgan 
Date: Thu, 25 Jul 2024 20:27:33 -0400
Subject: [PATCH 37/79] Update openai.md

---
 docs/openai.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/openai.md b/docs/openai.md
index 04d56bd6..fee30f71 100644
--- a/docs/openai.md
+++ b/docs/openai.md
@@ -78,7 +78,7 @@ curl http://localhost:11434/v1/chat/completions \
 - [x] Streaming
 - [x] JSON mode
 - [x] Reproducible outputs
-- [x] Tools
+- [x] Tools (streaming support coming soon)
 - [ ] Vision
 - [ ] Logprobs
 

From f5e3939220e9cd3d7a636708bc9df031ebfd4854 Mon Sep 17 00:00:00 2001
From: Jeffrey Morgan 
Date: Thu, 25 Jul 2024 23:10:18 -0400
Subject: [PATCH 38/79] Update api.md (#5968)

---
 docs/api.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
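
For illustration (not part of the patch): since tool use currently requires `stream` to be `false`, a non-streaming chat request from Go looks roughly like the sketch below; the tool definitions themselves would be supplied via the request's `Tools` field, omitted here.

```go
// Sketch only: a non-streaming chat request, as required when passing tools.
package main

import (
	"context"
	"log"

	"github.com/ollama/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}

	stream := false // tool calls are only returned for non-streaming requests
	req := &api.ChatRequest{
		Model:    "llama3.1",
		Messages: []api.Message{{Role: "user", Content: "What is the weather in Toronto?"}},
		Stream:   &stream,
		// req.Tools would carry the tool definitions (omitted in this sketch).
	}

	err = client.Chat(context.Background(), req, func(resp api.ChatResponse) error {
		log.Printf("%+v", resp.Message)
		return nil
	})
	if err != nil {
		log.Fatal(err)
	}
}
```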

diff --git a/docs/api.md b/docs/api.md
index 0ab70383..2d4fe28f 100644
--- a/docs/api.md
+++ b/docs/api.md
@@ -418,6 +418,7 @@ Generate the next message in a chat with a provided model. This is a streaming e
 
 - `model`: (required) the [model name](#model-names)
 - `messages`: the messages of the chat, this can be used to keep a chat memory
+- `tools`: tools for the model to use if supported. Requires `stream` to be set to `false`
 
 The `message` object has the following fields:
 
@@ -432,7 +433,6 @@ Advanced parameters (optional):
 - `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
 - `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects
 - `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)
-- `tools`: external tools the model can use. Not all models support this feature.
 
 ### Examples
 
@@ -1286,4 +1286,4 @@ curl http://localhost:11434/api/embeddings -d '{
     0.8785552978515625, -0.34576427936553955, 0.5742510557174683, -0.04222835972905159, -0.137906014919281
   ]
 }
-```
\ No newline at end of file
+```

From 15af5584238c17ae21853e7619e8008078e6e792 Mon Sep 17 00:00:00 2001
From: Michael Yang 
Date: Wed, 19 Jun 2024 14:14:28 -0700
Subject: [PATCH 39/79] include modelfile messages

---
 cmd/cmd.go       |  1 -
 server/images.go |  7 +------
 server/routes.go | 31 ++++++++++++++++++-------------
 3 files changed, 19 insertions(+), 20 deletions(-)
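
This change moves the stored `MESSAGE` entries out of the CLI and into server-side prompt assembly. A minimal, hypothetical sketch (not the actual server code) of the resulting ordering — model messages first, request messages after, with the model's system prompt prepended when the request does not supply one:

```go
// Sketch only: the message-assembly order this patch moves into the server.
package main

import (
	"fmt"

	"github.com/ollama/ollama/api"
)

func assemble(system string, modelMsgs, reqMsgs []api.Message) []api.Message {
	// Modelfile MESSAGE entries come first, then the request's messages.
	msgs := append(append([]api.Message{}, modelMsgs...), reqMsgs...)
	if system != "" && (len(reqMsgs) == 0 || reqMsgs[0].Role != "system") {
		msgs = append([]api.Message{{Role: "system", Content: system}}, msgs...)
	}
	return msgs
}

func main() {
	out := assemble(
		"You are terse.",
		[]api.Message{{Role: "user", Content: "Hey there hork!"}},
		[]api.Message{{Role: "user", Content: "Who are you?"}},
	)
	for _, m := range out {
		fmt.Printf("%s: %s\n", m.Role, m.Content)
	}
}
```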

diff --git a/cmd/cmd.go b/cmd/cmd.go
index b761d018..641afafb 100644
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -362,7 +362,6 @@ func RunHandler(cmd *cobra.Command, args []string) error {
 
 	opts.MultiModal = slices.Contains(info.Details.Families, "clip")
 	opts.ParentModel = info.Details.ParentModel
-	opts.Messages = append(opts.Messages, info.Messages...)
 
 	if interactive {
 		return generateInteractive(cmd, opts)
diff --git a/server/images.go b/server/images.go
index 836dbcc2..0f616551 100644
--- a/server/images.go
+++ b/server/images.go
@@ -70,7 +70,7 @@ type Model struct {
 	License        []string
 	Digest         string
 	Options        map[string]interface{}
-	Messages       []Message
+	Messages       []api.Message
 
 	Template *template.Template
 }
@@ -191,11 +191,6 @@ func (m *Model) String() string {
 	return modelfile.String()
 }
 
-type Message struct {
-	Role    string `json:"role"`
-	Content string `json:"content"`
-}
-
 type ConfigV2 struct {
 	ModelFormat   string   `json:"model_format"`
 	ModelFamily   string   `json:"model_family"`
diff --git a/server/routes.go b/server/routes.go
index e6ffe526..2b4d5794 100644
--- a/server/routes.go
+++ b/server/routes.go
@@ -164,17 +164,6 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 			}
 		}
 
-		var b bytes.Buffer
-		if req.Context != nil {
-			s, err := r.Detokenize(c.Request.Context(), req.Context)
-			if err != nil {
-				c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
-				return
-			}
-
-			b.WriteString(s)
-		}
-
 		var values template.Values
 		if req.Suffix != "" {
 			values.Prompt = prompt
@@ -187,6 +176,10 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 				msgs = append(msgs, api.Message{Role: "system", Content: m.System})
 			}
 
+			if req.Context == nil {
+				msgs = append(msgs, m.Messages...)
+			}
+
 			for _, i := range images {
 				msgs = append(msgs, api.Message{Role: "user", Content: fmt.Sprintf("[img-%d]", i.ID)})
 			}
@@ -194,11 +187,22 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 			values.Messages = append(msgs, api.Message{Role: "user", Content: req.Prompt})
 		}
 
+		var b bytes.Buffer
 		if err := tmpl.Execute(&b, values); err != nil {
 			c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 			return
 		}
 
+		if req.Context != nil {
+			s, err := r.Detokenize(c.Request.Context(), req.Context)
+			if err != nil {
+				c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
+				return
+			}
+
+			b.WriteString(s)
+		}
+
 		prompt = b.String()
 	}
 
@@ -1323,11 +1327,12 @@ func (s *Server) ChatHandler(c *gin.Context) {
 		return
 	}
 
+	msgs := append(m.Messages, req.Messages...)
 	if req.Messages[0].Role != "system" && m.System != "" {
-		req.Messages = append([]api.Message{{Role: "system", Content: m.System}}, req.Messages...)
+		msgs = append([]api.Message{{Role: "system", Content: m.System}}, msgs...)
 	}
 
-	prompt, images, err := chatPrompt(c.Request.Context(), m, r.Tokenize, opts, req.Messages, req.Tools)
+	prompt, images, err := chatPrompt(c.Request.Context(), m, r.Tokenize, opts, msgs, req.Tools)
 	if err != nil {
 		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 		return

From 3d9de805b777ca43746a6ae951b34689aa16e8e9 Mon Sep 17 00:00:00 2001
From: Michael Yang 
Date: Fri, 26 Jul 2024 13:19:01 -0700
Subject: [PATCH 40/79] fix: model save

The stop parameter is saved as a slice, which is incompatible with
Modelfile parsing.
---
 cmd/interactive.go      | 46 ++++++++++++++-----------
 cmd/interactive_test.go | 75 +++++++++++++++++++----------------------
 2 files changed, 60 insertions(+), 61 deletions(-)
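
The core of the fix is that a slice-valued option such as `stop` must be emitted as one `PARAMETER` command per element rather than as a single `[a b]` literal. A stand-alone, hypothetical sketch of that expansion (independent of the parser package used in the actual change):

```go
// Sketch only: expands options into Modelfile PARAMETER lines,
// emitting slice values one entry per line.
package main

import (
	"fmt"
	"sort"
	"strings"
)

func parameterLines(options map[string]any) string {
	keys := make([]string, 0, len(options))
	for k := range options {
		keys = append(keys, k)
	}
	sort.Strings(keys)

	var sb strings.Builder
	for _, k := range keys {
		switch v := options[k].(type) {
		case []string:
			for _, s := range v {
				fmt.Fprintf(&sb, "PARAMETER %s %s\n", k, s)
			}
		default:
			fmt.Fprintf(&sb, "PARAMETER %s %v\n", k, v)
		}
	}
	return sb.String()
}

func main() {
	fmt.Print(parameterLines(map[string]any{
		"temperature": 0.9,
		"stop":        []string{"hi", "there"},
	}))
	// Output:
	// PARAMETER stop hi
	// PARAMETER stop there
	// PARAMETER temperature 0.9
}
```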

diff --git a/cmd/interactive.go b/cmd/interactive.go
index adbc3e9f..2f83269e 100644
--- a/cmd/interactive.go
+++ b/cmd/interactive.go
@@ -1,6 +1,7 @@
 package cmd
 
 import (
+	"cmp"
 	"errors"
 	"fmt"
 	"io"
@@ -9,13 +10,14 @@ import (
 	"path/filepath"
 	"regexp"
 	"slices"
-	"sort"
 	"strings"
 
 	"github.com/spf13/cobra"
+	"golang.org/x/exp/maps"
 
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/envconfig"
+	"github.com/ollama/ollama/parser"
 	"github.com/ollama/ollama/progress"
 	"github.com/ollama/ollama/readline"
 	"github.com/ollama/ollama/types/errtypes"
@@ -375,9 +377,9 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 					return err
 				}
 				req := &api.ShowRequest{
-					Name:     opts.Model,
-					System:   opts.System,
-					Options:  opts.Options,
+					Name:    opts.Model,
+					System:  opts.System,
+					Options: opts.Options,
 				}
 				resp, err := client.Show(cmd.Context(), req)
 				if err != nil {
@@ -506,31 +508,35 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 }
 
 func buildModelfile(opts runOptions) string {
-	var mf strings.Builder
-	model := opts.ParentModel
-	if model == "" {
-		model = opts.Model
-	}
-	fmt.Fprintf(&mf, "FROM %s\n", model)
+	var f parser.File
+	f.Commands = append(f.Commands, parser.Command{Name: "model", Args: cmp.Or(opts.ParentModel, opts.Model)})
+
 	if opts.System != "" {
-		fmt.Fprintf(&mf, "SYSTEM \"\"\"%s\"\"\"\n", opts.System)
+		f.Commands = append(f.Commands, parser.Command{Name: "system", Args: opts.System})
 	}
 
-	keys := make([]string, 0)
-	for k := range opts.Options {
-		keys = append(keys, k)
-	}
-	sort.Strings(keys)
+	keys := maps.Keys(opts.Options)
+	slices.Sort(keys)
 	for _, k := range keys {
-		fmt.Fprintf(&mf, "PARAMETER %s %v\n", k, opts.Options[k])
+		v := opts.Options[k]
+		var cmds []parser.Command
+		switch t := v.(type) {
+		case []string:
+			for _, s := range t {
+				cmds = append(cmds, parser.Command{Name: k, Args: s})
+			}
+		default:
+			cmds = append(cmds, parser.Command{Name: k, Args: fmt.Sprintf("%v", t)})
+		}
+
+		f.Commands = append(f.Commands, cmds...)
 	}
-	fmt.Fprintln(&mf)
 
 	for _, msg := range opts.Messages {
-		fmt.Fprintf(&mf, "MESSAGE %s \"\"\"%s\"\"\"\n", msg.Role, msg.Content)
+		f.Commands = append(f.Commands, parser.Command{Name: "message", Args: fmt.Sprintf("%s: %s", msg.Role, msg.Content)})
 	}
 
-	return mf.String()
+	return f.String()
 }
 
 func normalizeFilePath(fp string) string {
diff --git a/cmd/interactive_test.go b/cmd/interactive_test.go
index 711f3860..bb7e0aba 100644
--- a/cmd/interactive_test.go
+++ b/cmd/interactive_test.go
@@ -1,12 +1,10 @@
 package cmd
 
 import (
-	"bytes"
 	"testing"
-	"text/template"
 
+	"github.com/google/go-cmp/cmp"
 	"github.com/stretchr/testify/assert"
-	"github.com/stretchr/testify/require"
 
 	"github.com/ollama/ollama/api"
 )
@@ -57,58 +55,53 @@ d:\path with\spaces\seven.svg inbetween7 c:\users\jdoe\eight.png inbetween8
 
 func TestModelfileBuilder(t *testing.T) {
 	opts := runOptions{
-		Model:    "hork",
-		System:   "You are part horse and part shark, but all hork. Do horklike things",
+		Model:  "hork",
+		System: "You are part horse and part shark, but all hork. Do horklike things",
 		Messages: []api.Message{
 			{Role: "user", Content: "Hey there hork!"},
 			{Role: "assistant", Content: "Yes it is true, I am half horse, half shark."},
 		},
-		Options: map[string]interface{}{},
+		Options: map[string]any{
+			"temperature":      0.9,
+			"seed":             42,
+			"penalize_newline": false,
+			"stop":             []string{"hi", "there"},
+		},
 	}
 
-	opts.Options["temperature"] = 0.9
-	opts.Options["seed"] = 42
-	opts.Options["penalize_newline"] = false
-	opts.Options["stop"] = []string{"hi", "there"}
-
-	mf := buildModelfile(opts)
-	expectedModelfile := `FROM {{.Model}}
-SYSTEM """{{.System}}"""
+	t.Run("model", func(t *testing.T) {
+		expect := `FROM hork
+SYSTEM You are part horse and part shark, but all hork. Do horklike things
 PARAMETER penalize_newline false
 PARAMETER seed 42
-PARAMETER stop [hi there]
+PARAMETER stop hi
+PARAMETER stop there
 PARAMETER temperature 0.9
-
-MESSAGE user """Hey there hork!"""
-MESSAGE assistant """Yes it is true, I am half horse, half shark."""
+MESSAGE user Hey there hork!
+MESSAGE assistant Yes it is true, I am half horse, half shark.
 `
 
-	tmpl, err := template.New("").Parse(expectedModelfile)
-	require.NoError(t, err)
+		actual := buildModelfile(opts)
+		if diff := cmp.Diff(expect, actual); diff != "" {
+			t.Errorf("mismatch (-want +got):\n%s", diff)
+		}
+	})
 
-	var buf bytes.Buffer
-	err = tmpl.Execute(&buf, opts)
-	require.NoError(t, err)
-	assert.Equal(t, buf.String(), mf)
-
-	opts.ParentModel = "horseshark"
-	mf = buildModelfile(opts)
-	expectedModelfile = `FROM {{.ParentModel}}
-SYSTEM """{{.System}}"""
+	t.Run("parent model", func(t *testing.T) {
+		opts.ParentModel = "horseshark"
+		expect := `FROM horseshark
+SYSTEM You are part horse and part shark, but all hork. Do horklike things
 PARAMETER penalize_newline false
 PARAMETER seed 42
-PARAMETER stop [hi there]
+PARAMETER stop hi
+PARAMETER stop there
 PARAMETER temperature 0.9
-
-MESSAGE user """Hey there hork!"""
-MESSAGE assistant """Yes it is true, I am half horse, half shark."""
+MESSAGE user Hey there hork!
+MESSAGE assistant Yes it is true, I am half horse, half shark.
 `
-
-	tmpl, err = template.New("").Parse(expectedModelfile)
-	require.NoError(t, err)
-
-	var parentBuf bytes.Buffer
-	err = tmpl.Execute(&parentBuf, opts)
-	require.NoError(t, err)
-	assert.Equal(t, parentBuf.String(), mf)
+		actual := buildModelfile(opts)
+		if diff := cmp.Diff(expect, actual); diff != "" {
+			t.Errorf("mismatch (-want +got):\n%s", diff)
+		}
+	})
 }

From a250c2cb13fd74b516dd138daad9ca54e30a9fab Mon Sep 17 00:00:00 2001
From: Michael Yang 
Date: Fri, 26 Jul 2024 13:39:38 -0700
Subject: [PATCH 41/79] display messages

---
 cmd/cmd.go         | 16 ++++++++++++++++
 cmd/interactive.go | 27 ++++-----------------------
 2 files changed, 20 insertions(+), 23 deletions(-)

diff --git a/cmd/cmd.go b/cmd/cmd.go
index 641afafb..22950885 100644
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -364,6 +364,22 @@ func RunHandler(cmd *cobra.Command, args []string) error {
 	opts.ParentModel = info.Details.ParentModel
 
 	if interactive {
+		if err := loadModel(cmd, &opts); err != nil {
+			return err
+		}
+
+		for _, msg := range info.Messages {
+			switch msg.Role {
+			case "user":
+				fmt.Printf(">>> %s\n", msg.Content)
+			case "assistant":
+				state := &displayResponseState{}
+				displayResponse(msg.Content, opts.WordWrap, state)
+				fmt.Println()
+				fmt.Println()
+			}
+		}
+
 		return generateInteractive(cmd, opts)
 	}
 	return generate(cmd, opts)
diff --git a/cmd/interactive.go b/cmd/interactive.go
index adbc3e9f..41b19971 100644
--- a/cmd/interactive.go
+++ b/cmd/interactive.go
@@ -46,29 +46,10 @@ func loadModel(cmd *cobra.Command, opts *runOptions) error {
 		KeepAlive: opts.KeepAlive,
 	}
 
-	return client.Chat(cmd.Context(), chatReq, func(resp api.ChatResponse) error {
-		p.StopAndClear()
-		for _, msg := range opts.Messages {
-			switch msg.Role {
-			case "user":
-				fmt.Printf(">>> %s\n", msg.Content)
-			case "assistant":
-				state := &displayResponseState{}
-				displayResponse(msg.Content, opts.WordWrap, state)
-				fmt.Println()
-				fmt.Println()
-			}
-		}
-		return nil
-	})
+	return client.Chat(cmd.Context(), chatReq, func(api.ChatResponse) error { return nil })
 }
 
 func generateInteractive(cmd *cobra.Command, opts runOptions) error {
-	err := loadModel(cmd, &opts)
-	if err != nil {
-		return err
-	}
-
 	usage := func() {
 		fmt.Fprintln(os.Stderr, "Available Commands:")
 		fmt.Fprintln(os.Stderr, "  /set            Set session variables")
@@ -375,9 +356,9 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 					return err
 				}
 				req := &api.ShowRequest{
-					Name:     opts.Model,
-					System:   opts.System,
-					Options:  opts.Options,
+					Name:    opts.Model,
+					System:  opts.System,
+					Options: opts.Options,
 				}
 				resp, err := client.Show(cmd.Context(), req)
 				if err != nil {

From a622c47bd32e4c7d8d6cd12ba8c7556fcc492524 Mon Sep 17 00:00:00 2001
From: Michael Yang 
Date: Fri, 26 Jul 2024 14:10:18 -0700
Subject: [PATCH 42/79] fix nil deref in auth.go

---
 server/auth.go   | 2 +-
 server/upload.go | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/server/auth.go b/server/auth.go
index e92a5b65..dcef5bf9 100644
--- a/server/auth.go
+++ b/server/auth.go
@@ -67,7 +67,7 @@ func getAuthorizationToken(ctx context.Context, challenge registryChallenge) (st
 
 	headers.Add("Authorization", signature)
 
-	response, err := makeRequest(ctx, http.MethodGet, redirectURL, headers, nil, nil)
+	response, err := makeRequest(ctx, http.MethodGet, redirectURL, headers, nil, &registryOptions{})
 	if err != nil {
 		return "", err
 	}
diff --git a/server/upload.go b/server/upload.go
index 73ce78ce..c4078c22 100644
--- a/server/upload.go
+++ b/server/upload.go
@@ -254,7 +254,7 @@ func (b *blobUpload) uploadPart(ctx context.Context, method string, requestURL *
 
 		// retry uploading to the redirect URL
 		for try := range maxRetries {
-			err = b.uploadPart(ctx, http.MethodPut, redirectURL, part, nil)
+			err = b.uploadPart(ctx, http.MethodPut, redirectURL, part, &registryOptions{})
 			switch {
 			case errors.Is(err, context.Canceled):
 				return err

From 750c1c55f7ea65219e4e24d6107a4a3ad519b53f Mon Sep 17 00:00:00 2001
From: Blake Mizerany 
Date: Fri, 26 Jul 2024 14:24:24 -0700
Subject: [PATCH 43/79] server: fix race conditions during download (#5994)

This fixes various data races scattered throughout the download/pull
client, where download state was being accessed concurrently without
synchronization.

This commit is mostly a hot-fix and will be replaced by a new client one
day soon.

Also, remove the unnecessary opts argument from downloadChunk.
---
 server/download.go | 59 ++++++++++++++++++++++++++++------------------
 1 file changed, 36 insertions(+), 23 deletions(-)
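
Two of the patterns used below are worth calling out: per-part progress moves from a plain field to `atomic.Int64`, and completion is signalled through a closed channel instead of a polled boolean. A minimal, hypothetical sketch of that shape (not the download code itself):

```go
// Sketch only: atomic progress counter plus a done channel,
// the same shape as the race fixes in this patch.
package main

import (
	"fmt"
	"time"
)

import "sync/atomic"

type download struct {
	completed atomic.Int64
	done      chan struct{}
	err       error
}

func (d *download) run() {
	defer close(d.done) // signal waiters exactly once
	for i := 0; i < 10; i++ {
		d.completed.Add(100)
		time.Sleep(10 * time.Millisecond)
	}
}

func (d *download) wait() error {
	ticker := time.NewTicker(25 * time.Millisecond)
	defer ticker.Stop()
	for {
		select {
		case <-d.done:
			return d.err
		case <-ticker.C:
			fmt.Println("completed:", d.completed.Load())
		}
	}
}

func main() {
	d := &download{done: make(chan struct{})}
	go d.run()
	if err := d.wait(); err != nil {
		fmt.Println("error:", err)
	}
}
```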

diff --git a/server/download.go b/server/download.go
index 8b5b577f..45483ba6 100644
--- a/server/download.go
+++ b/server/download.go
@@ -44,17 +44,19 @@ type blobDownload struct {
 
 	context.CancelFunc
 
-	done       bool
+	done       chan struct{}
 	err        error
 	references atomic.Int32
 }
 
 type blobDownloadPart struct {
-	N           int
-	Offset      int64
-	Size        int64
-	Completed   int64
-	lastUpdated time.Time
+	N         int
+	Offset    int64
+	Size      int64
+	Completed atomic.Int64
+
+	lastUpdatedMu sync.Mutex
+	lastUpdated   time.Time
 
 	*blobDownload `json:"-"`
 }
@@ -72,7 +74,7 @@ func (p *blobDownloadPart) Name() string {
 }
 
 func (p *blobDownloadPart) StartsAt() int64 {
-	return p.Offset + p.Completed
+	return p.Offset + p.Completed.Load()
 }
 
 func (p *blobDownloadPart) StopsAt() int64 {
@@ -82,7 +84,9 @@ func (p *blobDownloadPart) StopsAt() int64 {
 func (p *blobDownloadPart) Write(b []byte) (n int, err error) {
 	n = len(b)
 	p.blobDownload.Completed.Add(int64(n))
+	p.lastUpdatedMu.Lock()
 	p.lastUpdated = time.Now()
+	p.lastUpdatedMu.Unlock()
 	return n, nil
 }
 
@@ -92,6 +96,8 @@ func (b *blobDownload) Prepare(ctx context.Context, requestURL *url.URL, opts *r
 		return err
 	}
 
+	b.done = make(chan struct{})
+
 	for _, partFilePath := range partFilePaths {
 		part, err := b.readPart(partFilePath)
 		if err != nil {
@@ -99,7 +105,7 @@ func (b *blobDownload) Prepare(ctx context.Context, requestURL *url.URL, opts *r
 		}
 
 		b.Total += part.Size
-		b.Completed.Add(part.Completed)
+		b.Completed.Add(part.Completed.Load())
 		b.Parts = append(b.Parts, part)
 	}
 
@@ -139,6 +145,7 @@ func (b *blobDownload) Prepare(ctx context.Context, requestURL *url.URL, opts *r
 }
 
 func (b *blobDownload) Run(ctx context.Context, requestURL *url.URL, opts *registryOptions) {
+	defer close(b.done)
 	b.err = b.run(ctx, requestURL, opts)
 }
 
@@ -230,7 +237,7 @@ func (b *blobDownload) run(ctx context.Context, requestURL *url.URL, opts *regis
 	g.SetLimit(numDownloadParts)
 	for i := range b.Parts {
 		part := b.Parts[i]
-		if part.Completed == part.Size {
+		if part.Completed.Load() == part.Size {
 			continue
 		}
 
@@ -238,7 +245,7 @@ func (b *blobDownload) run(ctx context.Context, requestURL *url.URL, opts *regis
 			var err error
 			for try := 0; try < maxRetries; try++ {
 				w := io.NewOffsetWriter(file, part.StartsAt())
-				err = b.downloadChunk(inner, directURL, w, part, opts)
+				err = b.downloadChunk(inner, directURL, w, part)
 				switch {
 				case errors.Is(err, context.Canceled), errors.Is(err, syscall.ENOSPC):
 					// return immediately if the context is canceled or the device is out of space
@@ -279,29 +286,31 @@ func (b *blobDownload) run(ctx context.Context, requestURL *url.URL, opts *regis
 		return err
 	}
 
-	b.done = true
 	return nil
 }
 
-func (b *blobDownload) downloadChunk(ctx context.Context, requestURL *url.URL, w io.Writer, part *blobDownloadPart, opts *registryOptions) error {
+func (b *blobDownload) downloadChunk(ctx context.Context, requestURL *url.URL, w io.Writer, part *blobDownloadPart) error {
 	g, ctx := errgroup.WithContext(ctx)
 	g.Go(func() error {
-		headers := make(http.Header)
-		headers.Set("Range", fmt.Sprintf("bytes=%d-%d", part.StartsAt(), part.StopsAt()-1))
-		resp, err := makeRequestWithRetry(ctx, http.MethodGet, requestURL, headers, nil, opts)
+		req, err := http.NewRequestWithContext(ctx, http.MethodGet, requestURL.String(), nil)
+		if err != nil {
+			return err
+		}
+		req.Header.Set("Range", fmt.Sprintf("bytes=%d-%d", part.StartsAt(), part.StopsAt()-1))
+		resp, err := http.DefaultClient.Do(req)
 		if err != nil {
 			return err
 		}
 		defer resp.Body.Close()
 
-		n, err := io.CopyN(w, io.TeeReader(resp.Body, part), part.Size-part.Completed)
+		n, err := io.CopyN(w, io.TeeReader(resp.Body, part), part.Size-part.Completed.Load())
 		if err != nil && !errors.Is(err, context.Canceled) && !errors.Is(err, io.ErrUnexpectedEOF) {
 			// rollback progress
 			b.Completed.Add(-n)
 			return err
 		}
 
-		part.Completed += n
+		part.Completed.Add(n)
 		if err := b.writePart(part.Name(), part); err != nil {
 			return err
 		}
@@ -315,15 +324,21 @@ func (b *blobDownload) downloadChunk(ctx context.Context, requestURL *url.URL, w
 		for {
 			select {
 			case <-ticker.C:
-				if part.Completed >= part.Size {
+				if part.Completed.Load() >= part.Size {
 					return nil
 				}
 
-				if !part.lastUpdated.IsZero() && time.Since(part.lastUpdated) > 5*time.Second {
+				part.lastUpdatedMu.Lock()
+				lastUpdated := part.lastUpdated
+				part.lastUpdatedMu.Unlock()
+
+				if !lastUpdated.IsZero() && time.Since(lastUpdated) > 5*time.Second {
 					const msg = "%s part %d stalled; retrying. If this persists, press ctrl-c to exit, then 'ollama pull' to find a faster connection."
 					slog.Info(fmt.Sprintf(msg, b.Digest[7:19], part.N))
 					// reset last updated
+					part.lastUpdatedMu.Lock()
 					part.lastUpdated = time.Time{}
+					part.lastUpdatedMu.Unlock()
 					return errPartStalled
 				}
 			case <-ctx.Done():
@@ -388,6 +403,8 @@ func (b *blobDownload) Wait(ctx context.Context, fn func(api.ProgressResponse))
 	ticker := time.NewTicker(60 * time.Millisecond)
 	for {
 		select {
+		case <-b.done:
+			return b.err
 		case <-ticker.C:
 			fn(api.ProgressResponse{
 				Status:    fmt.Sprintf("pulling %s", b.Digest[7:19]),
@@ -395,10 +412,6 @@ func (b *blobDownload) Wait(ctx context.Context, fn func(api.ProgressResponse))
 				Total:     b.Total,
 				Completed: b.Completed.Load(),
 			})
-
-			if b.done || b.err != nil {
-				return b.err
-			}
 		case <-ctx.Done():
 			return ctx.Err()
 		}

From f2a96c7d778249a7f911471b6a1532339e42fcf5 Mon Sep 17 00:00:00 2001
From: Jeffrey Morgan 
Date: Fri, 26 Jul 2024 18:20:52 -0400
Subject: [PATCH 44/79] llm: keep patch for llama 3 rope factors (#5987)

---
 llm/patches/10-llama3-rope.diff | 70 +++++++++++++++++++++++++++++++++
 1 file changed, 70 insertions(+)
 create mode 100644 llm/patches/10-llama3-rope.diff

diff --git a/llm/patches/10-llama3-rope.diff b/llm/patches/10-llama3-rope.diff
new file mode 100644
index 00000000..39f38fea
--- /dev/null
+++ b/llm/patches/10-llama3-rope.diff
@@ -0,0 +1,70 @@
+From 2f872f294fb6f5c6e8f983b68c40ea656053dd92 Mon Sep 17 00:00:00 2001
+From: Michael Yang 
+Date: Tue, 23 Jul 2024 14:33:29 -0700
+Subject: [PATCH] llama 3.1 rope scaling
+
+---
+ src/llama.cpp | 14 ++++++++++++--
+ 1 file changed, 12 insertions(+), 2 deletions(-)
+
+diff --git a/src/llama.cpp b/src/llama.cpp
+index 8fe51971..a9969df8 100644
+--- a/src/llama.cpp
++++ b/src/llama.cpp
+@@ -2472,6 +2472,7 @@ struct llama_layer {
+     // long rope factors
+     struct ggml_tensor * rope_long  = nullptr;
+     struct ggml_tensor * rope_short = nullptr;
++    struct ggml_tensor * rope_freqs = nullptr;
+ 
+     // bitnet scale
+     struct ggml_tensor * wq_scale;
+@@ -6143,6 +6144,8 @@ static bool llm_load_tensors(
+ 
+                         layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+ 
++                        layer.rope_freqs  = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS,  "weight"), { n_embd/n_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
++
+                         if (n_expert == 0) {
+                             layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
+                             layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
+@@ -8620,6 +8623,10 @@ struct llm_build_context {
+         // choose long/short freq factors based on the context size
+         const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max;
+ 
++        if (model.layers[il].rope_freqs != nullptr) {
++            return model.layers[il].rope_freqs;
++        }
++
+         if (n_ctx_pre_seq > hparams.n_ctx_orig_yarn) {
+             return model.layers[il].rope_long;
+         }
+@@ -8814,6 +8821,9 @@ struct llm_build_context {
+ 
+             // self-attention
+             {
++                // rope freq factors for llama3; may return nullptr for llama2 and other models
++                struct ggml_tensor * rope_factors = build_rope_factors(il);
++
+                 // compute Q and K and RoPE them
+                 struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                 cb(Qcur, "Qcur", il);
+@@ -8837,14 +8847,14 @@ struct llm_build_context {
+                 }
+ 
+                 Qcur = ggml_rope_ext(
+-                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
++                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
+                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                     ext_factor, attn_factor, beta_fast, beta_slow
+                 );
+                 cb(Qcur, "Qcur", il);
+ 
+                 Kcur = ggml_rope_ext(
+-                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
++                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
+                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                     ext_factor, attn_factor, beta_fast, beta_slow
+                 );
+-- 
+2.45.2

From f3d7a481b75e0af89ae946d3923a239a3d835643 Mon Sep 17 00:00:00 2001
From: Tibor Schmidt 
Date: Sat, 27 Jul 2024 23:37:40 +0200
Subject: [PATCH 45/79] feat: add support for min_p (resolve #1142) (#1825)

---
 api/types.go          | 1 +
 cmd/interactive.go    | 1 +
 docs/api.md           | 1 +
 docs/modelfile.md     | 1 +
 llm/server.go         | 1 +
 parser/parser_test.go | 1 +
 6 files changed, 6 insertions(+)
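
As the modelfile documentation below describes, `min_p` keeps only tokens whose probability is at least `min_p` times the probability of the most likely token. A hypothetical Go sketch of that filtering rule on a toy distribution (the real sampling happens inside llama.cpp, not in this repository):

```go
// Sketch only: illustrates the min_p filtering rule on a toy distribution.
package main

import "fmt"

func minPFilter(probs map[string]float64, minP float64) map[string]float64 {
	var max float64
	for _, p := range probs {
		if p > max {
			max = p
		}
	}
	kept := make(map[string]float64)
	for tok, p := range probs {
		if p >= minP*max { // threshold is relative to the top token
			kept[tok] = p
		}
	}
	return kept
}

func main() {
	probs := map[string]float64{"the": 0.9, "a": 0.06, "zzz": 0.04}
	// With min_p = 0.05 and a top probability of 0.9, tokens below
	// 0.045 are dropped.
	fmt.Println(minPFilter(probs, 0.05))
}
```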

diff --git a/api/types.go b/api/types.go
index 65a99c76..35121813 100644
--- a/api/types.go
+++ b/api/types.go
@@ -209,6 +209,7 @@ type Options struct {
 	NumPredict       int      `json:"num_predict,omitempty"`
 	TopK             int      `json:"top_k,omitempty"`
 	TopP             float32  `json:"top_p,omitempty"`
+	MinP             float32  `json:"min_p,omitempty"`
 	TFSZ             float32  `json:"tfs_z,omitempty"`
 	TypicalP         float32  `json:"typical_p,omitempty"`
 	RepeatLastN      int      `json:"repeat_last_n,omitempty"`
diff --git a/cmd/interactive.go b/cmd/interactive.go
index adbc3e9f..c3cdf629 100644
--- a/cmd/interactive.go
+++ b/cmd/interactive.go
@@ -138,6 +138,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 		fmt.Fprintln(os.Stderr, "  /set parameter num_predict       Max number of tokens to predict")
 		fmt.Fprintln(os.Stderr, "  /set parameter top_k             Pick from top k num of tokens")
 		fmt.Fprintln(os.Stderr, "  /set parameter top_p           Pick token based on sum of probabilities")
+		fmt.Fprintln(os.Stderr, "  /set parameter min_p           Pick token based on top token probability * min_p")
 		fmt.Fprintln(os.Stderr, "  /set parameter num_ctx           Set the context size")
 		fmt.Fprintln(os.Stderr, "  /set parameter temperature     Set creativity level")
 		fmt.Fprintln(os.Stderr, "  /set parameter repeat_penalty  How strongly to penalize repetitions")
diff --git a/docs/api.md b/docs/api.md
index 2d4fe28f..90b41f3e 100644
--- a/docs/api.md
+++ b/docs/api.md
@@ -336,6 +336,7 @@ curl http://localhost:11434/api/generate -d '{
     "num_predict": 100,
     "top_k": 20,
     "top_p": 0.9,
+    "min_p": 0.0,
     "tfs_z": 0.5,
     "typical_p": 0.7,
     "repeat_last_n": 33,
diff --git a/docs/modelfile.md b/docs/modelfile.md
index c3645b06..852bf96c 100644
--- a/docs/modelfile.md
+++ b/docs/modelfile.md
@@ -141,6 +141,7 @@ PARAMETER  
 | num_predict    | Maximum number of tokens to predict when generating text. (Default: 128, -1 = infinite generation, -2 = fill context)                                                                                                                                   | int        | num_predict 42       |
 | top_k          | Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40)                                                                        | int        | top_k 40             |
 | top_p          | Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9)                                                                 | float      | top_p 0.9            |
+| min_p          | Alternative to the top_p, and aims to ensure a balance of quality and variety. The parameter *p* represents the minimum probability for a token to be considered, relative to the probability of the most likely token. For example, with *p*=0.05 and the most likely token having a probability of 0.9, logits with a value less than 0.045 are filtered out. (Default: 0.0) | float      | min_p 0.05            |
 
 ### TEMPLATE
 
diff --git a/llm/server.go b/llm/server.go
index 55732773..8127960f 100644
--- a/llm/server.go
+++ b/llm/server.go
@@ -727,6 +727,7 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
 		"temperature":       req.Options.Temperature,
 		"top_k":             req.Options.TopK,
 		"top_p":             req.Options.TopP,
+		"min_p":             req.Options.MinP,
 		"tfs_z":             req.Options.TFSZ,
 		"typical_p":         req.Options.TypicalP,
 		"repeat_last_n":     req.Options.RepeatLastN,
diff --git a/parser/parser_test.go b/parser/parser_test.go
index 2b5c4c88..48044bc0 100644
--- a/parser/parser_test.go
+++ b/parser/parser_test.go
@@ -451,6 +451,7 @@ func TestParseFileParameters(t *testing.T) {
 		"num_predict 1":                {"num_predict", "1"},
 		"top_k 1":                      {"top_k", "1"},
 		"top_p 1.0":                    {"top_p", "1.0"},
+		"min_p 0.05":                   {"min_p", "0.05"},
 		"tfs_z 1.0":                    {"tfs_z", "1.0"},
 		"typical_p 1.0":                {"typical_p", "1.0"},
 		"repeat_last_n 1":              {"repeat_last_n", "1"},

From 2c01610616074ef631ba5248f226099547ee7f57 Mon Sep 17 00:00:00 2001
From: Michael 
Date: Sun, 28 Jul 2024 17:21:38 -0400
Subject: [PATCH 46/79] update readme to llama3.1 (#5933)

---
 README.md | 31 ++++++++++++++++---------------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/README.md b/README.md
index e7b12943..65c3a013 100644
--- a/README.md
+++ b/README.md
@@ -35,10 +35,10 @@ The official [Ollama Docker image](https://hub.docker.com/r/ollama/ollama) `olla
 
 ## Quickstart
 
-To run and chat with [Llama 3](https://ollama.com/library/llama3):
+To run and chat with [Llama 3.1](https://ollama.com/library/llama3.1):
 
 ```
-ollama run llama3
+ollama run llama3.1
 ```
 
 ## Model library
@@ -49,8 +49,9 @@ Here are some example models that can be downloaded:
 
 | Model              | Parameters | Size  | Download                       |
 | ------------------ | ---------- | ----- | ------------------------------ |
-| Llama 3            | 8B         | 4.7GB | `ollama run llama3`            |
-| Llama 3            | 70B        | 40GB  | `ollama run llama3:70b`        |
+| Llama 3.1          | 8B         | 4.7GB | `ollama run llama3.1`          |
+| Llama 3.1          | 70B        | 40GB  | `ollama run llama3.1:70b`      |
+| Llama 3.1          | 405B       | 231GB | `ollama run llama3.1:405b`     |
 | Phi 3 Mini         | 3.8B       | 2.3GB | `ollama run phi3`              |
 | Phi 3 Medium       | 14B        | 7.9GB | `ollama run phi3:medium`       |
 | Gemma 2            | 9B         | 5.5GB | `ollama run gemma2`            |
@@ -97,16 +98,16 @@ See the [guide](docs/import.md) on importing models for more information.
 
 ### Customize a prompt
 
-Models from the Ollama library can be customized with a prompt. For example, to customize the `llama3` model:
+Models from the Ollama library can be customized with a prompt. For example, to customize the `llama3.1` model:
 
 ```
-ollama pull llama3
+ollama pull llama3.1
 ```
 
 Create a `Modelfile`:
 
 ```
-FROM llama3
+FROM llama3.1
 
 # set the temperature to 1 [higher is more creative, lower is more coherent]
 PARAMETER temperature 1
@@ -141,7 +142,7 @@ ollama create mymodel -f ./Modelfile
 ### Pull a model
 
 ```
-ollama pull llama3
+ollama pull llama3.1
 ```
 
 > This command can also be used to update a local model. Only the diff will be pulled.
@@ -149,13 +150,13 @@ ollama pull llama3
 ### Remove a model
 
 ```
-ollama rm llama3
+ollama rm llama3.1
 ```
 
 ### Copy a model
 
 ```
-ollama cp llama3 my-model
+ollama cp llama3.1 my-model
 ```
 
 ### Multiline input
@@ -179,14 +180,14 @@ The image features a yellow smiley face, which is likely the central focus of th
 ### Pass the prompt as an argument
 
 ```
-$ ollama run llama3 "Summarize this file: $(cat README.md)"
+$ ollama run llama3.1 "Summarize this file: $(cat README.md)"
  Ollama is a lightweight, extensible framework for building and running language models on the local machine. It provides a simple API for creating, running, and managing models, as well as a library of pre-built models that can be easily used in a variety of applications.
 ```
 
 ### Show model information
 
 ```
-ollama show llama3
+ollama show llama3.1
 ```
 
 ### List models on your computer
@@ -214,7 +215,7 @@ Next, start the server:
 Finally, in a separate shell, run a model:
 
 ```
-./ollama run llama3
+./ollama run llama3.1
 ```
 
 ## REST API
@@ -225,7 +226,7 @@ Ollama has a REST API for running and managing models.
 
 ```
 curl http://localhost:11434/api/generate -d '{
-  "model": "llama3",
+  "model": "llama3.1",
   "prompt":"Why is the sky blue?"
 }'
 ```
@@ -234,7 +235,7 @@ curl http://localhost:11434/api/generate -d '{
 
 ```
 curl http://localhost:11434/api/chat -d '{
-  "model": "llama3",
+  "model": "llama3.1",
   "messages": [
     { "role": "user", "content": "why is the sky blue?" }
   ]

From 0e4d653687f81db40622e287a846245b319f1fbe Mon Sep 17 00:00:00 2001
From: Jeffrey Morgan 
Date: Sun, 28 Jul 2024 19:56:02 -0700
Subject: [PATCH 47/79] update to `llama3.1` elsewhere in repo (#6032)

---
 app/ollama.iss                | 2 +-
 app/ollama_welcome.ps1        | 2 +-
 docs/docker.md                | 2 +-
 docs/faq.md                   | 2 +-
 docs/tutorials/langchainjs.md | 4 ++--
 macapp/src/app.tsx            | 2 +-
 6 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/app/ollama.iss b/app/ollama.iss
index 6bedb9ff..dc6178f7 100644
--- a/app/ollama.iss
+++ b/app/ollama.iss
@@ -138,7 +138,7 @@ SetupAppRunningError=Another Ollama installer is running.%n%nPlease cancel or fi
 
 
 ;FinishedHeadingLabel=Run your first model
-;FinishedLabel=%nRun this command in a PowerShell or cmd terminal.%n%n%n    ollama run llama3
+;FinishedLabel=%nRun this command in a PowerShell or cmd terminal.%n%n%n    ollama run llama3.1
 ;ClickFinish=%n
 
 [Registry]
diff --git a/app/ollama_welcome.ps1 b/app/ollama_welcome.ps1
index 9af37a46..46777a3a 100644
--- a/app/ollama_welcome.ps1
+++ b/app/ollama_welcome.ps1
@@ -4,5 +4,5 @@ write-host "Welcome to Ollama!"
 write-host ""
 write-host "Run your first model:"
 write-host ""
-write-host "`tollama run llama3"
+write-host "`tollama run llama3.1"
 write-host ""
\ No newline at end of file
diff --git a/docs/docker.md b/docs/docker.md
index 0b58562b..a34c3291 100644
--- a/docs/docker.md
+++ b/docs/docker.md
@@ -63,7 +63,7 @@ docker run -d --device /dev/kfd --device /dev/dri -v ollama:/root/.ollama -p 114
 Now you can run a model:
 
 ```
-docker exec -it ollama ollama run llama3
+docker exec -it ollama ollama run llama3.1
 ```
 
 ### Try different models
diff --git a/docs/faq.md b/docs/faq.md
index da1848f7..f2f32af4 100644
--- a/docs/faq.md
+++ b/docs/faq.md
@@ -227,7 +227,7 @@ curl http://localhost:11434/api/chat -d '{"model": "mistral"}'
 
 To preload a model using the CLI, use the command:
 ```shell
-ollama run llama3 ""
+ollama run llama3.1 ""
 ```
 
 ## How do I keep a model loaded in memory or make it unload immediately?
diff --git a/docs/tutorials/langchainjs.md b/docs/tutorials/langchainjs.md
index 4d60afb6..f925869b 100644
--- a/docs/tutorials/langchainjs.md
+++ b/docs/tutorials/langchainjs.md
@@ -15,7 +15,7 @@ import { Ollama } from "@langchain/community/llms/ollama";
 
 const ollama = new Ollama({
   baseUrl: "http://localhost:11434",
-  model: "llama3",
+  model: "llama3.1",
 });
 
 const answer = await ollama.invoke(`why is the sky blue?`);
@@ -23,7 +23,7 @@ const answer = await ollama.invoke(`why is the sky blue?`);
 console.log(answer);
 ```
 
-That will get us the same thing as if we ran `ollama run llama3 "why is the sky blue"` in the terminal. But we want to load a document from the web to ask a question against. **Cheerio** is a great library for ingesting a webpage, and **LangChain** uses it in their **CheerioWebBaseLoader**. So let's install **Cheerio** and build that part of the app.
+That will get us the same thing as if we ran `ollama run llama3.1 "why is the sky blue"` in the terminal. But we want to load a document from the web to ask a question against. **Cheerio** is a great library for ingesting a webpage, and **LangChain** uses it in their **CheerioWebBaseLoader**. So let's install **Cheerio** and build that part of the app.
 
 ```bash
 npm install cheerio
diff --git a/macapp/src/app.tsx b/macapp/src/app.tsx
index ab17df60..a627e63d 100644
--- a/macapp/src/app.tsx
+++ b/macapp/src/app.tsx
@@ -19,7 +19,7 @@ export default function () {
   const [step, setStep] = useState(Step.WELCOME)
   const [commandCopied, setCommandCopied] = useState(false)
 
-  const command = 'ollama run llama3'
+  const command = 'ollama run llama3.1'
 
   return (
     
From 6f26e9322fd4639b4e414f8890b0213783e74d7c Mon Sep 17 00:00:00 2001 From: Veit Heller Date: Mon, 29 Jul 2024 17:50:53 +0200 Subject: [PATCH 48/79] Fix typo in image docs (#6041) --- docs/api.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/api.md b/docs/api.md index 90b41f3e..c0202ef3 100644 --- a/docs/api.md +++ b/docs/api.md @@ -587,7 +587,7 @@ Final response: ##### Request -Send a chat message with a conversation history. +Send a chat message with images. The images should be provided as an array, with the individual images encoded in Base64. ```shell curl http://localhost:11434/api/chat -d '{ From f26aef9a8bfdd3e0f0d13cafe8bd371f29d9d877 Mon Sep 17 00:00:00 2001 From: Ikko Eltociear Ashimine Date: Tue, 30 Jul 2024 02:53:30 +0900 Subject: [PATCH 49/79] docs: update README.md (#6059) HuggingFace -> Hugging Face --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 65c3a013..824b3761 100644 --- a/README.md +++ b/README.md @@ -390,7 +390,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [Llama Coder](https://github.com/ex3ndr/llama-coder) (Copilot alternative using Ollama) - [Ollama Copilot](https://github.com/bernardo-bruning/ollama-copilot) (Proxy that allows you to use ollama as a copilot like Github copilot) - [twinny](https://github.com/rjmacarthy/twinny) (Copilot and Copilot chat alternative using Ollama) -- [Wingman-AI](https://github.com/RussellCanfield/wingman-ai) (Copilot code and chat alternative using Ollama and HuggingFace) +- [Wingman-AI](https://github.com/RussellCanfield/wingman-ai) (Copilot code and chat alternative using Ollama and Hugging Face) - [Page Assist](https://github.com/n4ze3m/page-assist) (Chrome Extension) - [AI Telegram Bot](https://github.com/tusharhero/aitelegrambot) (Telegram bot using Ollama in backend) - [AI ST Completion](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (Sublime Text 4 AI assistant plugin with Ollama support) From 68ee42f995a04bd163eb1c714f53d4c25ab25474 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Mon, 29 Jul 2024 13:20:26 -0700 Subject: [PATCH 50/79] update llama.cpp submodule to `6eeaeba1` (#6039) --- llm/ext_server/server.cpp | 9 --- llm/llama.cpp | 2 +- llm/patches/05-default-pretokenizer.diff | 10 ++-- llm/patches/09-lora.diff | 6 +- llm/patches/10-llama3-rope.diff | 70 ------------------------ 5 files changed, 8 insertions(+), 89 deletions(-) delete mode 100644 llm/patches/10-llama3-rope.diff diff --git a/llm/ext_server/server.cpp b/llm/ext_server/server.cpp index 14d921c0..0d51460c 100644 --- a/llm/ext_server/server.cpp +++ b/llm/ext_server/server.cpp @@ -2438,15 +2438,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i])); params.use_mmap = false; } - else if (arg == "--lora-base") - { - if (++i >= argc) - { - invalid_param = true; - break; - } - params.lora_base = argv[i]; - } else if (arg == "-v" || arg == "--verbose") { server_verbose = true; diff --git a/llm/llama.cpp b/llm/llama.cpp index d94c6e0c..6eeaeba1 160000 --- a/llm/llama.cpp +++ b/llm/llama.cpp @@ -1 +1 @@ -Subproject commit d94c6e0ccbd29ee1ba4f44e9caa8682ad94df9fa +Subproject commit 6eeaeba126ff701f3e8f79f246805b7023709972 diff --git a/llm/patches/05-default-pretokenizer.diff b/llm/patches/05-default-pretokenizer.diff index 646bc49c..0d40fc3c 100644 --- a/llm/patches/05-default-pretokenizer.diff +++ b/llm/patches/05-default-pretokenizer.diff @@ -1,8 
+1,8 @@ diff --git a/src/llama.cpp b/src/llama.cpp -index 8fe51971..7113ba64 100644 +index a207451f..2ddf431d 100644 --- a/src/llama.cpp +++ b/src/llama.cpp -@@ -5433,16 +5433,7 @@ static void llm_load_vocab( +@@ -5347,16 +5347,7 @@ static void llm_load_vocab( if (vocab.type == LLAMA_VOCAB_TYPE_BPE) { vocab.tokenizer_add_space_prefix = false; vocab.tokenizer_clean_spaces = true; @@ -20,9 +20,9 @@ index 8fe51971..7113ba64 100644 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; } else if ( tokenizer_pre == "llama3" || -@@ -5526,7 +5517,8 @@ static void llm_load_vocab( - vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMOLLM; - vocab.tokenizer_clean_spaces = false; +@@ -5443,7 +5434,8 @@ static void llm_load_vocab( + tokenizer_pre == "codeshell") { + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CODESHELL; } else { - throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str())); + LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__); diff --git a/llm/patches/09-lora.diff b/llm/patches/09-lora.diff index fc1017a6..10c66d1d 100644 --- a/llm/patches/09-lora.diff +++ b/llm/patches/09-lora.diff @@ -2,7 +2,7 @@ diff --git a/common/common.cpp b/common/common.cpp index dbb724fb..c26fe6ee 100644 --- a/common/common.cpp +++ b/common/common.cpp -@@ -2087,14 +2087,29 @@ std::tuple llama_init_from_gpt_par +@@ -2087,14 +2087,27 @@ std::tuple llama_init_from_gpt_par for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) { const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]); float lora_scale = std::get<1>(params.lora_adapter[i]); @@ -20,9 +20,7 @@ index dbb724fb..c26fe6ee 100644 + int err = llama_model_apply_lora_from_file(model, + lora_adapter.c_str(), + lora_scale, -+ ((i > 0) || params.lora_base.empty()) -+ ? NULL -+ : params.lora_base.c_str(), ++ nullptr, + params.n_threads); + if (err != 0) { + fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__); diff --git a/llm/patches/10-llama3-rope.diff b/llm/patches/10-llama3-rope.diff deleted file mode 100644 index 39f38fea..00000000 --- a/llm/patches/10-llama3-rope.diff +++ /dev/null @@ -1,70 +0,0 @@ -From 2f872f294fb6f5c6e8f983b68c40ea656053dd92 Mon Sep 17 00:00:00 2001 -From: Michael Yang -Date: Tue, 23 Jul 2024 14:33:29 -0700 -Subject: [PATCH] llama 3.1 rope scaling - ---- - src/llama.cpp | 14 ++++++++++++-- - 1 file changed, 12 insertions(+), 2 deletions(-) - -diff --git a/src/llama.cpp b/src/llama.cpp -index 8fe51971..a9969df8 100644 ---- a/src/llama.cpp -+++ b/src/llama.cpp -@@ -2472,6 +2472,7 @@ struct llama_layer { - // long rope factors - struct ggml_tensor * rope_long = nullptr; - struct ggml_tensor * rope_short = nullptr; -+ struct ggml_tensor * rope_freqs = nullptr; - - // bitnet scale - struct ggml_tensor * wq_scale; -@@ -6143,6 +6144,8 @@ static bool llm_load_tensors( - - layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); - -+ layer.rope_freqs = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS, "weight"), { n_embd/n_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? 
llama_model_loader::TENSOR_DUPLICATED : 0)); -+ - if (n_expert == 0) { - layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); - layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); -@@ -8620,6 +8623,10 @@ struct llm_build_context { - // choose long/short freq factors based on the context size - const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max; - -+ if (model.layers[il].rope_freqs != nullptr) { -+ return model.layers[il].rope_freqs; -+ } -+ - if (n_ctx_pre_seq > hparams.n_ctx_orig_yarn) { - return model.layers[il].rope_long; - } -@@ -8814,6 +8821,9 @@ struct llm_build_context { - - // self-attention - { -+ // rope freq factors for llama3; may return nullptr for llama2 and other models -+ struct ggml_tensor * rope_factors = build_rope_factors(il); -+ - // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); -@@ -8837,14 +8847,14 @@ struct llm_build_context { - } - - Qcur = ggml_rope_ext( -- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, -+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_ext( -- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, -+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); --- -2.45.2 From 46e6327e0f85b046f5f92995d7f59146d347cd70 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Mon, 29 Jul 2024 13:35:16 -0700 Subject: [PATCH 51/79] api: add stringifier for `Tool` (#5891) --- api/types.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/api/types.go b/api/types.go index 35121813..ea5161ff 100644 --- a/api/types.go +++ b/api/types.go @@ -114,6 +114,11 @@ func (t Tools) String() string { return string(bts) } +func (t Tool) String() string { + bts, _ := json.Marshal(t) + return string(bts) +} + // Message is a single message in a chat sequence. The message contains the // role ("system", "user", or "assistant"), the content and an optional list // of images. 
From 365431d40617b85d0308fec8d0bd9c0cdb1ab3a4 Mon Sep 17 00:00:00 2001 From: royjhan <65097070+royjhan@users.noreply.github.com> Date: Mon, 29 Jul 2024 13:56:57 -0700 Subject: [PATCH 52/79] return tool calls finish reason for openai (#5995) * hot fix * backend stream support * clean up * finish reason * move to openai --- openai/openai.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/openai/openai.go b/openai/openai.go index de6f4bd5..5bd80660 100644 --- a/openai/openai.go +++ b/openai/openai.go @@ -218,6 +218,9 @@ func toChatCompletion(id string, r api.ChatResponse) ChatCompletion { Index: 0, Message: Message{Role: r.Message.Role, Content: r.Message.Content, ToolCalls: toolCalls}, FinishReason: func(reason string) *string { + if len(toolCalls) > 0 { + reason = "tool_calls" + } if len(reason) > 0 { return &reason } From 0be8baad2b684cda667fa5d48bf334382913a09c Mon Sep 17 00:00:00 2001 From: Kim Hallberg Date: Tue, 30 Jul 2024 08:56:37 +0200 Subject: [PATCH 53/79] Update and Fix example models (#6065) * Update example models * Remove unused README.md --- examples/go-chat/main.go | 2 +- examples/go-generate-streaming/main.go | 2 +- examples/go-generate/main.go | 2 +- examples/go-http-generate/README.md | 0 examples/langchain-python-rag-document/README.md | 8 ++++++++ examples/langchain-python-rag-document/main.py | 2 +- examples/langchain-python-rag-websummary/README.md | 4 ++-- examples/langchain-python-rag-websummary/main.py | 4 ++-- examples/langchain-python-simple/README.md | 4 ++-- examples/langchain-python-simple/main.py | 2 +- examples/modelfile-mario/Modelfile | 2 +- examples/modelfile-mario/readme.md | 6 +++--- examples/python-dockerit/dockerit.py | 2 +- examples/python-json-datagenerator/predefinedschema.py | 2 +- examples/python-json-datagenerator/randomaddresses.py | 2 +- examples/python-json-datagenerator/readme.md | 4 ++-- examples/python-simplechat/client.py | 2 +- examples/python-simplechat/readme.md | 4 ++-- examples/typescript-simplechat/client.ts | 2 +- 19 files changed, 32 insertions(+), 24 deletions(-) delete mode 100644 examples/go-http-generate/README.md diff --git a/examples/go-chat/main.go b/examples/go-chat/main.go index 5266f03e..7663fb8f 100644 --- a/examples/go-chat/main.go +++ b/examples/go-chat/main.go @@ -35,7 +35,7 @@ func main() { ctx := context.Background() req := &api.ChatRequest{ - Model: "llama3", + Model: "llama3.1", Messages: messages, } diff --git a/examples/go-generate-streaming/main.go b/examples/go-generate-streaming/main.go index 49403351..3acfb22a 100644 --- a/examples/go-generate-streaming/main.go +++ b/examples/go-generate-streaming/main.go @@ -16,7 +16,7 @@ func main() { // By default, GenerateRequest is streaming. 
req := &api.GenerateRequest{ - Model: "gemma", + Model: "gemma2", Prompt: "how many planets are there?", } diff --git a/examples/go-generate/main.go b/examples/go-generate/main.go index 50fbf64b..2fe28742 100644 --- a/examples/go-generate/main.go +++ b/examples/go-generate/main.go @@ -15,7 +15,7 @@ func main() { } req := &api.GenerateRequest{ - Model: "gemma", + Model: "gemma2", Prompt: "how many planets are there?", // set streaming to false diff --git a/examples/go-http-generate/README.md b/examples/go-http-generate/README.md deleted file mode 100644 index e69de29b..00000000 diff --git a/examples/langchain-python-rag-document/README.md b/examples/langchain-python-rag-document/README.md index 20a73a88..e2f3bc02 100644 --- a/examples/langchain-python-rag-document/README.md +++ b/examples/langchain-python-rag-document/README.md @@ -4,6 +4,14 @@ This example provides an interface for asking questions to a PDF document. ## Setup +1. Ensure you have the `llama3.1` model installed: + +``` +ollama pull llama3.1 +``` + +2. Install the Python Requirements. + ``` pip install -r requirements.txt ``` diff --git a/examples/langchain-python-rag-document/main.py b/examples/langchain-python-rag-document/main.py index 3ed9499f..6f7cec9b 100644 --- a/examples/langchain-python-rag-document/main.py +++ b/examples/langchain-python-rag-document/main.py @@ -51,7 +51,7 @@ while True: template=template, ) - llm = Ollama(model="llama3:8b", callback_manager=CallbackManager([StreamingStdOutCallbackHandler()])) + llm = Ollama(model="llama3.1", callback_manager=CallbackManager([StreamingStdOutCallbackHandler()])) qa_chain = RetrievalQA.from_chain_type( llm, retriever=vectorstore.as_retriever(), diff --git a/examples/langchain-python-rag-websummary/README.md b/examples/langchain-python-rag-websummary/README.md index 3f3b9873..29c706a3 100644 --- a/examples/langchain-python-rag-websummary/README.md +++ b/examples/langchain-python-rag-websummary/README.md @@ -4,10 +4,10 @@ This example summarizes the website, [https://ollama.com/blog/run-llama2-uncenso ## Running the Example -1. Ensure you have the `llama2` model installed: +1. Ensure you have the `llama3.1` model installed: ```bash - ollama pull llama2 + ollama pull llama3.1 ``` 2. Install the Python Requirements. diff --git a/examples/langchain-python-rag-websummary/main.py b/examples/langchain-python-rag-websummary/main.py index d1b05ba8..77b09fbb 100644 --- a/examples/langchain-python-rag-websummary/main.py +++ b/examples/langchain-python-rag-websummary/main.py @@ -5,8 +5,8 @@ from langchain.chains.summarize import load_summarize_chain loader = WebBaseLoader("https://ollama.com/blog/run-llama2-uncensored-locally") docs = loader.load() -llm = Ollama(model="llama3") +llm = Ollama(model="llama3.1") chain = load_summarize_chain(llm, chain_type="stuff") -result = chain.invoke(docs) +result = chain.invoke(docs) print(result) diff --git a/examples/langchain-python-simple/README.md b/examples/langchain-python-simple/README.md index d4102dec..60db2c8c 100644 --- a/examples/langchain-python-simple/README.md +++ b/examples/langchain-python-simple/README.md @@ -4,10 +4,10 @@ This example is a basic "hello world" of using LangChain with Ollama. ## Running the Example -1. Ensure you have the `llama3` model installed: +1. Ensure you have the `llama3.1` model installed: ```bash - ollama pull llama3 + ollama pull llama3.1 ``` 2. Install the Python Requirements. 
diff --git a/examples/langchain-python-simple/main.py b/examples/langchain-python-simple/main.py index 7cb65286..a7ed81d6 100644 --- a/examples/langchain-python-simple/main.py +++ b/examples/langchain-python-simple/main.py @@ -1,6 +1,6 @@ from langchain.llms import Ollama input = input("What is your question?") -llm = Ollama(model="llama3") +llm = Ollama(model="llama3.1") res = llm.predict(input) print (res) diff --git a/examples/modelfile-mario/Modelfile b/examples/modelfile-mario/Modelfile index 33d5952b..a3747086 100644 --- a/examples/modelfile-mario/Modelfile +++ b/examples/modelfile-mario/Modelfile @@ -1,4 +1,4 @@ -FROM llama3 +FROM llama3.1 PARAMETER temperature 1 SYSTEM """ You are Mario from super mario bros, acting as an assistant. diff --git a/examples/modelfile-mario/readme.md b/examples/modelfile-mario/readme.md index e4f0d417..c3f34197 100644 --- a/examples/modelfile-mario/readme.md +++ b/examples/modelfile-mario/readme.md @@ -2,12 +2,12 @@ # Example character: Mario -This example shows how to create a basic character using Llama3 as the base model. +This example shows how to create a basic character using Llama3.1 as the base model. To run this example: 1. Download the Modelfile -2. `ollama pull llama3` to get the base model used in the model file. +2. `ollama pull llama3.1` to get the base model used in the model file. 3. `ollama create NAME -f ./Modelfile` 4. `ollama run NAME` @@ -18,7 +18,7 @@ Ask it some questions like "Who are you?" or "Is Peach in trouble again?" What the model file looks like: ``` -FROM llama3 +FROM llama3.1 PARAMETER temperature 1 SYSTEM """ You are Mario from Super Mario Bros, acting as an assistant. diff --git a/examples/python-dockerit/dockerit.py b/examples/python-dockerit/dockerit.py index b013102f..6a288d90 100644 --- a/examples/python-dockerit/dockerit.py +++ b/examples/python-dockerit/dockerit.py @@ -4,7 +4,7 @@ imageName = input("Enter the name of the image: ") client = docker.from_env() s = requests.Session() output="" -with s.post('http://localhost:11434/api/generate', json={'model': 'dockerit', 'prompt': inputDescription}, stream=True) as r: +with s.post('http://localhost:11434/api/generate', json={'model': 'mattw/dockerit', 'prompt': inputDescription}, stream=True) as r: for line in r.iter_lines(): if line: j = json.loads(line) diff --git a/examples/python-json-datagenerator/predefinedschema.py b/examples/python-json-datagenerator/predefinedschema.py index 1fd54892..68090ad7 100644 --- a/examples/python-json-datagenerator/predefinedschema.py +++ b/examples/python-json-datagenerator/predefinedschema.py @@ -2,7 +2,7 @@ import requests import json import random -model = "llama3" +model = "llama3.1" template = { "firstName": "", "lastName": "", diff --git a/examples/python-json-datagenerator/randomaddresses.py b/examples/python-json-datagenerator/randomaddresses.py index 72b1fefb..878c9803 100644 --- a/examples/python-json-datagenerator/randomaddresses.py +++ b/examples/python-json-datagenerator/randomaddresses.py @@ -12,7 +12,7 @@ countries = [ "France", ] country = random.choice(countries) -model = "llama3" +model = "llama3.1" prompt = f"generate one realistically believable sample data set of a persons first name, last name, address in {country}, and phone number. Do not use common names. Respond using JSON. Key names should have no backslashes, values should use plain ascii with no special characters." 
diff --git a/examples/python-json-datagenerator/readme.md b/examples/python-json-datagenerator/readme.md index 88357044..5b444dff 100644 --- a/examples/python-json-datagenerator/readme.md +++ b/examples/python-json-datagenerator/readme.md @@ -6,10 +6,10 @@ There are two python scripts in this example. `randomaddresses.py` generates ran ## Running the Example -1. Ensure you have the `llama3` model installed: +1. Ensure you have the `llama3.1` model installed: ```bash - ollama pull llama3 + ollama pull llama3.1 ``` 2. Install the Python Requirements. diff --git a/examples/python-simplechat/client.py b/examples/python-simplechat/client.py index f82a16b3..85043d5f 100644 --- a/examples/python-simplechat/client.py +++ b/examples/python-simplechat/client.py @@ -2,7 +2,7 @@ import json import requests # NOTE: ollama must be running for this to work, start the ollama app or run `ollama serve` -model = "llama3" # TODO: update this for whatever model you wish to use +model = "llama3.1" # TODO: update this for whatever model you wish to use def chat(messages): diff --git a/examples/python-simplechat/readme.md b/examples/python-simplechat/readme.md index dd2576bc..4c2ded4d 100644 --- a/examples/python-simplechat/readme.md +++ b/examples/python-simplechat/readme.md @@ -4,10 +4,10 @@ The **chat** endpoint is one of two ways to generate text from an LLM with Ollam ## Running the Example -1. Ensure you have the `llama3` model installed: +1. Ensure you have the `llama3.1` model installed: ```bash - ollama pull llama3 + ollama pull llama3.1 ``` 2. Install the Python Requirements. diff --git a/examples/typescript-simplechat/client.ts b/examples/typescript-simplechat/client.ts index a1e0eea3..8ad113b1 100644 --- a/examples/typescript-simplechat/client.ts +++ b/examples/typescript-simplechat/client.ts @@ -1,6 +1,6 @@ import * as readline from "readline"; -const model = "llama3"; +const model = "llama3.1"; type Message = { role: "assistant" | "user" | "system"; content: string; From 345420998e90090d2d6fba38ad5c2f3f5512adf4 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Mon, 22 Jul 2024 11:57:26 -0700 Subject: [PATCH 54/79] Prevent partial loading on mixed GPU brands In mult-brand GPU setups, if we couldn't fully load the model we would fall through the scheduler and mistakenly try to load across a mix of brands. This makes sure we find the set of GPU(s) that best fit for the partial load. --- server/sched.go | 31 +++++++++++++++++++++++++++---- server/sched_test.go | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+), 4 deletions(-) diff --git a/server/sched.go b/server/sched.go index 2daed3ab..92b8d508 100644 --- a/server/sched.go +++ b/server/sched.go @@ -212,9 +212,12 @@ func (s *Scheduler) processPending(ctx context.Context) { } else if loadedCount == 0 { // No models loaded. Load the model but prefer the best fit. 
slog.Debug("loading first model", "model", pending.model.ModelPath) - g := pickBestFitGPUs(pending, ggml, gpus, &numParallel) + g := pickBestFullFitByLibrary(pending, ggml, gpus, &numParallel) if g != nil { gpus = g + } else { + // Only allow partial loads when this is the first model + gpus = pickBestPartialFitByLibrary(pending, ggml, gpus, &numParallel) } s.loadFn(pending, ggml, gpus, numParallel) break @@ -231,7 +234,7 @@ func (s *Scheduler) processPending(ctx context.Context) { // Update free memory from currently loaded models s.updateFreeSpace(availGpus) - fitGpus := pickBestFitGPUs(pending, ggml, availGpus, &numParallel) + fitGpus := pickBestFullFitByLibrary(pending, ggml, availGpus, &numParallel) if fitGpus != nil { slog.Debug("new model fits with existing models, loading") s.loadFn(pending, ggml, fitGpus, numParallel) @@ -668,11 +671,12 @@ func (a ByDuration) Less(i, j int) bool { // func (a BySize) Swap(i, j int) { a[i], a[j] = a[j], a[i] } // func (a BySize) Less(i, j int) bool { return a[i].estimatedVRAM < a[j].estimatedVRAM } -// pickBestFitGPUs will try to find the optimal placement of the model in the available GPUs where the model fully fits +// pickBestFullFitByLibrary will try to find the optimal placement of the model in the available GPUs where the model fully fits +// The list of GPUs returned will always be the same brand (library) // If the model can not be fit fully within the available GPU(s) nil is returned // If numParallel is <= 0, this will attempt try to optimize parallism based on available VRAM, and adjust // opts.NumCtx accordingly -func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList { +func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList { var estimatedVRAM uint64 var numParallelToTry []int @@ -723,6 +727,25 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numP return nil } +// If multiple Libraries are detected, pick the Library which loads the most layers for the model +func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList { + *numParallel = 1 + byLibrary := gpus.ByLibrary() + if len(byLibrary) <= 1 { + return gpus + } + var bestEstimate uint64 + var bestFit int + for i, gl := range byLibrary { + _, estimatedVRAM := llm.PredictServerFit(gl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts) + if estimatedVRAM > bestEstimate { + bestEstimate = estimatedVRAM + bestFit = i + } + } + return byLibrary[bestFit] +} + // findRunnerToUnload finds a runner to unload to make room for a new model func (s *Scheduler) findRunnerToUnload() *runnerRef { s.loadedMu.Lock() diff --git a/server/sched_test.go b/server/sched_test.go index 9ddd1fab..a186ce0e 100644 --- a/server/sched_test.go +++ b/server/sched_test.go @@ -666,6 +666,45 @@ func TestAlreadyCanceled(t *testing.T) { require.Empty(t, scenario1a.req.successCh) } +func TestHomogeneousGPUs(t *testing.T) { + ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond) + defer done() + s := InitScheduler(ctx) + + s.getGpuFn = func() gpu.GpuInfoList { + // Set memory values to require the model to be spread + gpus := []gpu.GpuInfo{ + {Library: "cuda"}, + {Library: "rocm"}, + } + gpus[0].TotalMemory = 1 * format.GibiByte + gpus[0].FreeMemory = 256 * format.MebiByte + gpus[1].TotalMemory = 1 * format.GibiByte + gpus[1].FreeMemory = 256 * format.MebiByte + return gpus + } + 
s.getCpuFn = getCpuFn + a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond}) + s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) { + require.Len(t, gpus, 1) + return a.newServer(gpus, model, ggml, adapters, projectors, opts, numParallel) + } + slog.Info("a") + s.pendingReqCh <- a.req + require.Len(t, s.pendingReqCh, 1) + s.Run(ctx) + select { + case resp := <-a.req.successCh: + require.Equal(t, resp.llama, a.srv) + require.Empty(t, s.pendingReqCh) + require.Empty(t, a.req.errCh) + case err := <-a.req.errCh: + t.Fatal(err.Error()) + case <-ctx.Done(): + t.Fatal("timeout") + } +} + type mockLlm struct { pingResp error waitResp error From 1b44d873e74f62de4f53f154da386919c1426f8b Mon Sep 17 00:00:00 2001 From: royjhan <65097070+royjhan@users.noreply.github.com> Date: Tue, 30 Jul 2024 13:12:21 -0700 Subject: [PATCH 55/79] Add Metrics to `api\embed` response (#5709) * add prompt tokens to embed response * rm slog * metrics * types * prompt n * clean up * reset submodule * update tests * test name * list metrics --- api/types.go | 4 ++++ integration/embed_test.go | 8 ++++++++ llm/ext_server/server.cpp | 7 ++++++- llm/server.go | 13 +++++++------ server/routes.go | 18 ++++++++++++------ server/sched_test.go | 4 ++-- 6 files changed, 39 insertions(+), 15 deletions(-) diff --git a/api/types.go b/api/types.go index ea5161ff..c2529652 100644 --- a/api/types.go +++ b/api/types.go @@ -267,6 +267,10 @@ type EmbedRequest struct { type EmbedResponse struct { Model string `json:"model"` Embeddings [][]float32 `json:"embeddings"` + + TotalDuration time.Duration `json:"total_duration,omitempty"` + LoadDuration time.Duration `json:"load_duration,omitempty"` + PromptEvalCount int `json:"prompt_eval_count,omitempty"` } // EmbeddingRequest is the request passed to [Client.Embeddings]. 
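To make the new metrics concrete, here is a minimal sketch of a client reading them back. It assumes the Go client's usual `api.ClientFromEnvironment()` constructor and an `Embed` method wrapping `POST /api/embed` (the exact client call shape is an assumption); the response fields are the ones added to `EmbedResponse` above, and the `all-minilm` model matches the integration tests in this patch.

```go
package main

import (
	"context"
	"fmt"
	"log"

	"github.com/ollama/ollama/api"
)

func main() {
	// Assumed standard client constructor that reads OLLAMA_HOST.
	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}

	// Assumed client-side Embed method wrapping POST /api/embed.
	resp, err := client.Embed(context.Background(), &api.EmbedRequest{
		Model: "all-minilm",
		Input: []string{"why is the sky blue?", "why is grass green?"},
	})
	if err != nil {
		log.Fatal(err)
	}

	// The metrics added in this patch arrive alongside the embeddings.
	fmt.Println("embeddings returned:", len(resp.Embeddings))
	fmt.Println("prompt_eval_count:", resp.PromptEvalCount)
	fmt.Println("total_duration:", resp.TotalDuration)
	fmt.Println("load_duration:", resp.LoadDuration)
}
```

Note that `prompt_eval_count` is summed across all inputs in a batch, which is why the batch integration test below expects 16 tokens for two inputs of 8 tokens each.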
diff --git a/integration/embed_test.go b/integration/embed_test.go index 61b36fa2..10333d5d 100644 --- a/integration/embed_test.go +++ b/integration/embed_test.go @@ -69,6 +69,10 @@ func TestAllMiniLMEmbed(t *testing.T) { if !floatsEqual32(res.Embeddings[0][0], 0.010071031) { t.Fatalf("expected 0.010071031, got %.8f", res.Embeddings[0][0]) } + + if res.PromptEvalCount != 8 { + t.Fatalf("expected 8 prompt tokens, got %d", res.PromptEvalCount) + } } func TestAllMiniLMBatchEmbed(t *testing.T) { @@ -97,6 +101,10 @@ func TestAllMiniLMBatchEmbed(t *testing.T) { if !floatsEqual32(res.Embeddings[0][0], 0.010071031) || !floatsEqual32(res.Embeddings[1][0], -0.009802706) { t.Fatalf("expected 0.010071031 and -0.009802706, got %.8f and %.8f", res.Embeddings[0][0], res.Embeddings[1][0]) } + + if res.PromptEvalCount != 16 { + t.Fatalf("expected 16 prompt tokens, got %d", res.PromptEvalCount) + } } func TestAllMiniLMEmbedTruncate(t *testing.T) { diff --git a/llm/ext_server/server.cpp b/llm/ext_server/server.cpp index 0d51460c..d72bb1b1 100644 --- a/llm/ext_server/server.cpp +++ b/llm/ext_server/server.cpp @@ -1221,6 +1221,7 @@ struct llama_server_context res.result_json = json { {"embedding", std::vector(embd, embd + n_embd)}, + {"timings", slot.get_formated_timings()}, }; } } @@ -3203,11 +3204,15 @@ int main(int argc, char **argv) { responses = result.result_json.value("results", std::vector{result.result_json}); json embeddings = json::array(); + + int prompt_n = 0; for (auto & elem : responses) { embeddings.push_back(elem.at("embedding")); + prompt_n += elem.at("timings").at("prompt_n").get(); } + // send the result - json embedding_res = json{{"embedding", embeddings}}; + json embedding_res = json{{"embedding", embeddings}, {"prompt_n", prompt_n}}; return res.set_content(embedding_res.dump(), "application/json; charset=utf-8"); } }); diff --git a/llm/server.go b/llm/server.go index 8127960f..afde077e 100644 --- a/llm/server.go +++ b/llm/server.go @@ -33,7 +33,7 @@ type LlamaServer interface { Ping(ctx context.Context) error WaitUntilRunning(ctx context.Context) error Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error - Embed(ctx context.Context, input []string) ([][]float32, error) + Embed(ctx context.Context, input []string) (*EmbedResponse, error) Tokenize(ctx context.Context, content string) ([]int, error) Detokenize(ctx context.Context, tokens []int) (string, error) Close() error @@ -879,10 +879,11 @@ type EmbedRequest struct { } type EmbedResponse struct { - Embedding [][]float32 `json:"embedding"` + Embedding [][]float32 `json:"embedding"` + PromptEvalCount int `json:"prompt_n"` } -func (s *llmServer) Embed(ctx context.Context, input []string) ([][]float32, error) { +func (s *llmServer) Embed(ctx context.Context, input []string) (*EmbedResponse, error) { if err := s.sem.Acquire(ctx, 1); err != nil { slog.Error("Failed to acquire semaphore", "error", err) return nil, err @@ -924,12 +925,12 @@ func (s *llmServer) Embed(ctx context.Context, input []string) ([][]float32, err return nil, fmt.Errorf("%s", body) } - var embedding EmbedResponse - if err := json.Unmarshal(body, &embedding); err != nil { + var e EmbedResponse + if err := json.Unmarshal(body, &e); err != nil { return nil, fmt.Errorf("unmarshal tokenize response: %w", err) } - return embedding.Embedding, nil + return &e, nil } type TokenizeRequest struct { diff --git a/server/routes.go b/server/routes.go index e6ffe526..a560f369 100644 --- a/server/routes.go +++ b/server/routes.go @@ -284,6 +284,7 @@ 
func (s *Server) GenerateHandler(c *gin.Context) { } func (s *Server) EmbedHandler(c *gin.Context) { + checkpointStart := time.Now() var req api.EmbedRequest err := c.ShouldBindJSON(&req) switch { @@ -332,6 +333,8 @@ func (s *Server) EmbedHandler(c *gin.Context) { return } + checkpointLoaded := time.Now() + kvData, err := getKVData(m.ModelPath, false) if err != nil { c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) @@ -370,13 +373,16 @@ func (s *Server) EmbedHandler(c *gin.Context) { return } - for i, e := range embeddings { - embeddings[i] = normalize(e) + for i, e := range embeddings.Embedding { + embeddings.Embedding[i] = normalize(e) } resp := api.EmbedResponse{ - Model: req.Model, - Embeddings: embeddings, + Model: req.Model, + Embeddings: embeddings.Embedding, + TotalDuration: time.Since(checkpointStart), + LoadDuration: checkpointLoaded.Sub(checkpointStart), + PromptEvalCount: embeddings.PromptEvalCount, } c.JSON(http.StatusOK, resp) } @@ -428,9 +434,9 @@ func (s *Server) EmbeddingsHandler(c *gin.Context) { return } - embedding := make([]float64, len(embeddings[0])) + embedding := make([]float64, len(embeddings.Embedding[0])) - for i, v := range embeddings[0] { + for i, v := range embeddings.Embedding[0] { embedding[i] = float64(v) } diff --git a/server/sched_test.go b/server/sched_test.go index a186ce0e..4f8789fa 100644 --- a/server/sched_test.go +++ b/server/sched_test.go @@ -709,7 +709,7 @@ type mockLlm struct { pingResp error waitResp error completionResp error - embedResp [][]float32 + embedResp *llm.EmbedResponse embedRespErr error tokenizeResp []int tokenizeRespErr error @@ -727,7 +727,7 @@ func (s *mockLlm) WaitUntilRunning(ctx context.Context) error { return s.waitRes func (s *mockLlm) Completion(ctx context.Context, req llm.CompletionRequest, fn func(llm.CompletionResponse)) error { return s.completionResp } -func (s *mockLlm) Embed(ctx context.Context, input []string) ([][]float32, error) { +func (s *mockLlm) Embed(ctx context.Context, input []string) (*llm.EmbedResponse, error) { return s.embedResp, s.embedRespErr } func (s *mockLlm) Tokenize(ctx context.Context, content string) ([]int, error) { From afa8d6e9d56da834a03df7817d065f6c8b46e102 Mon Sep 17 00:00:00 2001 From: jmorganca Date: Tue, 30 Jul 2024 18:06:26 -0700 Subject: [PATCH 56/79] patch gemma support --- llm/patches/10-params.diff | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 llm/patches/10-params.diff diff --git a/llm/patches/10-params.diff b/llm/patches/10-params.diff new file mode 100644 index 00000000..56699b8e --- /dev/null +++ b/llm/patches/10-params.diff @@ -0,0 +1,20 @@ +diff --git a/src/llama.cpp b/src/llama.cpp +index a207451f..fba6b175 100644 +--- a/src/llama.cpp ++++ b/src/llama.cpp +@@ -4969,6 +4969,7 @@ static void llm_load_hparams( + hparams.attn_soft_cap = true; + + switch (hparams.n_layer) { ++ case 26: model.type = e_model::MODEL_2B; break; + case 42: model.type = e_model::MODEL_9B; break; + case 46: model.type = e_model::MODEL_27B; break; + default: model.type = e_model::MODEL_UNKNOWN; +@@ -11736,6 +11737,7 @@ struct llm_build_context { + + // ref: https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e + switch (model.type) { ++ case e_model::MODEL_2B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); break; + case e_model::MODEL_9B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); break; + case e_model::MODEL_27B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd / 
n_head))); break; + default: GGML_ABORT("fatal error"); From 5d6657835669064fa9658e6712b01887a072c606 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Tue, 30 Jul 2024 18:08:34 -0700 Subject: [PATCH 57/79] Update README.md Better example for multi-modal input --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 824b3761..0593a785 100644 --- a/README.md +++ b/README.md @@ -173,7 +173,7 @@ I'm a basic program that prints the famous "Hello, world!" message to the consol ### Multimodal models ``` ->>> What's in this image? /Users/jmorgan/Desktop/smile.png +ollama run llava "What's in this image? /Users/jmorgan/Desktop/smile.png" The image features a yellow smiley face, which is likely the central focus of the picture. ``` From 3579b4966a9b21e048db4f7610e3f9f4a5c4dc64 Mon Sep 17 00:00:00 2001 From: Michael Date: Tue, 30 Jul 2024 18:40:09 -0700 Subject: [PATCH 58/79] Update README to include Firebase Genkit (#6083) Firebase Genkit --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 0593a785..941a4f99 100644 --- a/README.md +++ b/README.md @@ -337,6 +337,7 @@ See the [API documentation](./docs/api.md) for all endpoints. ### Libraries - [LangChain](https://python.langchain.com/docs/integrations/llms/ollama) and [LangChain.js](https://js.langchain.com/docs/modules/model_io/models/llms/integrations/ollama) with [example](https://js.langchain.com/docs/use_cases/question_answering/local_retrieval_qa) +- [Firebase Genkit](https://firebase.google.com/docs/genkit/plugins/ollama) - [LangChainGo](https://github.com/tmc/langchaingo/) with [example](https://github.com/tmc/langchaingo/tree/main/examples/ollama-completion-example) - [LangChain4j](https://github.com/langchain4j/langchain4j) with [example](https://github.com/langchain4j/langchain4j-examples/tree/main/ollama-examples/src/main/java) - [LangChainRust](https://github.com/Abraxas-365/langchain-rust) with [example](https://github.com/Abraxas-365/langchain-rust/blob/main/examples/llm_ollama.rs) From 463a8aa2731a9fe5258c6c7e1466f3dae27f0c6a Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Tue, 30 Jul 2024 21:01:12 -0700 Subject: [PATCH 59/79] Create SECURITY.md --- SECURITY.md | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 SECURITY.md diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 00000000..d38bb7c4 --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,25 @@ +# Security + +The Ollama maintainer team takes security seriously and will actively work to resolve security issues. + +## Reporting a vulnerability + +If you discover a security vulnerability, please do not open a public issue. Instead, please report it by emailing hello@ollama.com. We ask that you give us sufficient time to investigate and address the vulnerability before disclosing it publicly. 
+ +Please include the following details in your report: +- A description of the vulnerability +- Steps to reproduce the issue +- Your assessment of the potential impact +- Any possible mitigations + +## Security best practices + +While the maintainer team does their best to secure Ollama, users are encouraged to implement their own security best practices, such as: + +- Regularly updating to the latest version of Ollama +- Securing access to hosted instances of Ollama +- Monitoring systems for unusual activity + +## Contact + +For any other questions or concerns related to security, please contact us at hello@ollama.com From 71399aa682726e472ca271f02417d87f6f8be429 Mon Sep 17 00:00:00 2001 From: Daniel Nguyen Date: Wed, 31 Jul 2024 22:44:58 +0700 Subject: [PATCH 60/79] Added BoltAI as a desktop UI for Ollama (#6096) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 941a4f99..0cc15266 100644 --- a/README.md +++ b/README.md @@ -299,6 +299,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [AI Studio](https://github.com/MindWorkAI/AI-Studio) - [Sidellama](https://github.com/gyopak/sidellama) (browser-based LLM client) - [LLMStack](https://github.com/trypromptly/LLMStack) (No-code multi-agent framework to build LLM agents and workflows) +- [BoltAI for Mac](https://boltai.com) (AI Chat Client for Mac) ### Terminal From 6b252918fb5e17f9be5975efe1681a92153b8379 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Mon, 3 Jun 2024 09:49:13 -0700 Subject: [PATCH 61/79] update convert test to check result data --- convert/convert_test.go | 111 +++++-- .../testdata/Meta-Llama-3-8B-Instruct.json | 313 ++++++++++++++++++ .../testdata/Mistral-7B-Instruct-v0.2.json | 313 ++++++++++++++++++ .../testdata/Mixtral-8x7B-Instruct-v0.1.json | 1 + convert/testdata/gemma-2b-it.json | 188 +++++++++++ llm/ggla.go | 14 +- llm/ggml.go | 7 +- llm/gguf.go | 14 +- 8 files changed, 924 insertions(+), 37 deletions(-) create mode 100644 convert/testdata/Meta-Llama-3-8B-Instruct.json create mode 100644 convert/testdata/Mistral-7B-Instruct-v0.2.json create mode 100644 convert/testdata/Mixtral-8x7B-Instruct-v0.1.json create mode 100644 convert/testdata/gemma-2b-it.json diff --git a/convert/convert_test.go b/convert/convert_test.go index 6aa33a49..a3727bed 100644 --- a/convert/convert_test.go +++ b/convert/convert_test.go @@ -1,29 +1,36 @@ -//go:build slow - package convert import ( + "crypto/sha256" + "encoding/json" + "flag" + "fmt" + "io" + "log/slog" + "math" "os" "path/filepath" + "slices" "testing" "github.com/ollama/ollama/llm" + "golang.org/x/exp/maps" ) -func convertFull(t *testing.T, p string) (llm.KV, llm.Tensors) { +func convertFull(t *testing.T, d string) (*os.File, llm.KV, llm.Tensors) { t.Helper() - mf, err := GetModelFormat(p) + mf, err := GetModelFormat(d) if err != nil { t.Fatal(err) } - params, err := mf.GetParams(p) + params, err := mf.GetParams(d) if err != nil { t.Fatal(err) } - arch, err := mf.GetModelArch("", p, params) + arch, err := mf.GetModelArch("", d, params) if err != nil { t.Fatal(err) } @@ -50,53 +57,91 @@ func convertFull(t *testing.T, p string) (llm.KV, llm.Tensors) { if err != nil { t.Fatal(err) } - defer r.Close() + t.Cleanup(func() { r.Close() }) - m, _, err := llm.DecodeGGML(r) + m, _, err := llm.DecodeGGML(r, math.MaxInt) if err != nil { t.Fatal(err) } - return m.KV(), m.Tensors() + if _, err := r.Seek(0, io.SeekStart); err != nil { + t.Fatal(err) + } + + return r, m.KV(), m.Tensors() +} + +func TestMain(m *testing.M) { + var level 
slog.Level + flag.TextVar(&level, "level", slog.LevelInfo, "log level") + flag.Parse() + slog.SetLogLoggerLevel(level) + os.Exit(m.Run()) } func TestConvertFull(t *testing.T) { - cases := []struct { - path string - arch string - tensors int - layers int - }{ - {"Meta-Llama-3-8B-Instruct", "llama", 291, 35}, - {"Mistral-7B-Instruct-v0.2", "llama", 291, 35}, - {"Mixtral-8x7B-Instruct-v0.1", "llama", 291, 35}, - {"gemma-2b-it", "gemma", 164, 20}, + cases := []string{ + "Meta-Llama-3-8B-Instruct", + "Mistral-7B-Instruct-v0.2", + "Mixtral-8x7B-Instruct-v0.1", + "gemma-2b-it", } - for _, tt := range cases { - t.Run(tt.path, func(t *testing.T) { - p := filepath.Join("testdata", tt.path) - if _, err := os.Stat(p); err != nil { + for i := range cases { + tt := cases[i] + t.Run(tt, func(t *testing.T) { + t.Parallel() + + p := filepath.Join("testdata", tt) + if testing.Short() { + t.Skip("skipping in short mode") + } else if _, err := os.Stat(p); err != nil { t.Skipf("%s not found", p) } - kv, tensors := convertFull(t, p) + f, kv, tensors := convertFull(t, p) + actual := make(map[string]string) + for k, v := range kv { + if s, ok := v.(json.Marshaler); !ok { + actual[k] = fmt.Sprintf("%v", v) + } else { + bts, err := json.Marshal(s) + if err != nil { + t.Fatal(err) + } - if kv.Architecture() != tt.arch { - t.Fatalf("expected llama, got %s", kv.Architecture()) + actual[k] = fmt.Sprintf("%x", sha256.Sum256(bts)) + } } - if kv.FileType().String() != "F16" { - t.Fatalf("expected F16, got %s", kv.FileType()) + for _, tensor := range tensors.Items { + sha256sum := sha256.New() + sr := io.NewSectionReader(f, int64(tensors.Offset+tensor.Offset), int64(tensor.Size())) + if _, err := io.Copy(sha256sum, sr); err != nil { + t.Fatal(err) + } + + actual[tensor.Name] = fmt.Sprintf("%x", sha256sum.Sum(nil)) } - if len(tensors) != tt.tensors { - t.Fatalf("expected %d tensors, got %d", tt.tensors, len(tensors)) + expectFile, err := os.Open(filepath.Join("testdata", fmt.Sprintf("%s.json", tt))) + if err != nil { + t.Fatal(err) } - layers := tensors.Layers() - if len(layers) != tt.layers { - t.Fatalf("expected %d layers, got %d", tt.layers, len(layers)) + var expect map[string]string + if err := json.NewDecoder(expectFile).Decode(&expect); err != nil { + t.Fatal(err) + } + + keys := maps.Keys(expect) + slices.Sort(keys) + for _, k := range keys { + if v, ok := actual[k]; !ok { + t.Errorf("missing %s", k) + } else if v != expect[k] { + t.Errorf("unexpected %s: want %s, got %s", k, expect[k], v) + } } }) } diff --git a/convert/testdata/Meta-Llama-3-8B-Instruct.json b/convert/testdata/Meta-Llama-3-8B-Instruct.json new file mode 100644 index 00000000..808826bb --- /dev/null +++ b/convert/testdata/Meta-Llama-3-8B-Instruct.json @@ -0,0 +1,313 @@ +{ + "general.architecture": "llama", + "general.file_type": "1", + "general.quantization_version": "2", + "llama.block_count": "32", + "llama.context_length": "8192", + "llama.embedding_length": "4096", + "llama.feed_forward_length": "14336", + "llama.rope.dimension_count": "128", + "llama.rope.freq_base": "500000", + "llama.vocab_size": "128256", + "llama.attention.head_count": "32", + "llama.attention.head_count_kv": "8", + "llama.attention.layer_norm_rms_epsilon": "1e-05", + "tokenizer.ggml.model": "gpt2", + "tokenizer.ggml.pre": "llama-bpe", + "tokenizer.ggml.bos_token_id": "128000", + "tokenizer.ggml.eos_token_id": "128009", + "tokenizer.ggml.merges": "d0cbac1fcc9dcf03724b8db5c9bfb593ae1cf68fb9bc72eb1d15274dcbbf618b", + "tokenizer.ggml.token_type": 
"d70a88809fd7da6f1f028622685cd64268a7a922c5d343c96f25b66327358978", + "tokenizer.ggml.tokens": "765b529dbcbc42dd202ce657341c63807b51f3b07e09898f6aa6196326865d5a", + "token_embd.weight": "b53102a11d9064bbd404833e3464b1b13e08ce73300b442312cccde2f19b2698", + "blk.0.attn_norm.weight": "7318df3cca9e8d153ff0a503026a1265e63d20b2a8c1dd7a2769585082b5d1ee", + "blk.0.ffn_down.weight": "b950806a1fc722c9fad7fd0b20c3c0a7fb50f14395e1e7663a590bfd62e20900", + "blk.0.ffn_gate.weight": "e73e580af6d4f08e060a74a3c25efdf5d3bed99e183d95a5a85ae859014839fd", + "blk.0.ffn_up.weight": "c8158af679ef99746da1befb67eebb19489e0bbe6ce7d97e13e348508244e516", + "blk.0.ffn_norm.weight": "7ec69c3c31e95e49a3359003b0033f6b9e85561a3e3fd83e7476661ecdd756bb", + "blk.0.attn_k.weight": "2732303257bac969b4964e0e32ec08b5a7f5c031bb02bf6ac4467b3ea0ebcf1e", + "blk.0.attn_output.weight": "ecda1d43b4ccc91cd5b366d7e7a275353990ac78561a07c83d9c77031aba12dc", + "blk.0.attn_q.weight": "569b1f5faf92b6f00910cf7effb2d5862f91038ce5c3b0019fc10e5d79fbd5e1", + "blk.0.attn_v.weight": "aa8416c5ef7e32fb54a1f20d6ac651656845d4af240564b397c39bd83e06e3b8", + "blk.1.attn_norm.weight": "03327e02862908c2a44b2f52decdb924bf4201f400b46f8037a9cb2e1d7a61ff", + "blk.1.ffn_down.weight": "5a83a87603f38c99f8e1e370a2d5f967bb45ac51d881a609304a7811027321e0", + "blk.1.ffn_gate.weight": "31da0572c79e655186c721c231376f85e56cdcc6257c28d08c8c5b40d5c22b40", + "blk.1.ffn_up.weight": "e0c811d64ca155c8de10a868e72015d43888834804614ee1aa2953129ffbc90f", + "blk.1.ffn_norm.weight": "5861f313d6137d6f0f904d423df47fffc6069e224ff746e1b637ac9c7f0af862", + "blk.1.attn_k.weight": "5fbbec0acca6457b9416ebdcd90e526885d0224537b7628f6be376a7f275313d", + "blk.1.attn_output.weight": "b237c9763fa3f75166a6f70b70f1566e77d0d89dfa164ed1b3137393e90575c3", + "blk.1.attn_q.weight": "c0a9cf4a98b4882b16f3eb2b49d933793dcc5357abb246fd3fe3134ed2b12e1c", + "blk.1.attn_v.weight": "96867111727200cac1af7865189dd41fd62b47584e5e5f33a91f1d34509cbd40", + "blk.2.attn_norm.weight": "f392f8a88ee3a95b1cc19c40dd4ef66317037b0faaa1800f610779e129ee0539", + "blk.2.ffn_down.weight": "73823eef46632aedcc8c1cb08a736b6aa97ca97842cd1fdfc5567d8dec459662", + "blk.2.ffn_gate.weight": "f4909ae19fc3848b00bb8b9050122e74f8e903b89e22937036f4cc9fea20a718", + "blk.2.ffn_up.weight": "16f4904a3d814ea68f00519724fc4943e48444a84c786bda39aa5efc298a7d84", + "blk.2.ffn_norm.weight": "e3ccdf56e75cb969f6f69c39caf6daf7c4e70e89e25df0f4d2e4bc60e159aafe", + "blk.2.attn_k.weight": "c3beb1e0a11bcf007ef0f0d8f6bdd3082d8b29090cd29597846b5d51e308a8e5", + "blk.2.attn_output.weight": "bb9f66c32cff51154fea92933c2cd62549236f8cb1a767f9ef28d3f99809b343", + "blk.2.attn_q.weight": "8eba394132eef2a05c5a92d62d2376000f7948448d7a2dc74e6b608203add20d", + "blk.2.attn_v.weight": "88f61f77c53567c617db3eef8f30621109a750e679f6784f7911739bd42c2f02", + "blk.3.attn_norm.weight": "7b996675b7ca75fa24107b3ebe0788653ede0f49ac83b8659d71ff54d591f81a", + "blk.3.ffn_down.weight": "2cb332bc05e4821962fdc9dcbcc7cc12630f32117711b687d18fb53c0bc4fbf4", + "blk.3.ffn_gate.weight": "340b387c7f208c8f0a6db904ef8d87c1e84b7d6ad57177abd32d86c8d18b760f", + "blk.3.ffn_up.weight": "07484433f8a7ee061c55aa0de2ecc009f769b0617c9c0ec096e9bb2946df9f0e", + "blk.3.ffn_norm.weight": "4f1a4ade36b393af341240bc894a2aab09cff7e4d56dc4658445deb107f9371b", + "blk.3.attn_k.weight": "483dcd96acb4528df84b9842970994630dbd82b8715ace394aa8b39fcf8d6291", + "blk.3.attn_output.weight": "beaff0810687923585642ee11d929cbf3b43dc6f87f30ddb552c222ab57bdbb3", + "blk.3.attn_q.weight": 
"0739355002f6fce520863add697e0ff25fc88215322dc3f993be7bb68dcce7e8", + "blk.3.attn_v.weight": "c216d17b6d90ee3e07f82598b8161fae34de2f392dbb0f745b682b578c324767", + "blk.4.attn_norm.weight": "91ab405bc4ba15bf63af233f266aa43aaab43789a9e6596e14a357c2ac7df217", + "blk.4.ffn_down.weight": "620f34ee75cdc73aecb8949af5fbb0d2437fd81422b6d8eb7acfc52addb9fc68", + "blk.4.ffn_gate.weight": "f6feec7bc9acadf35ec22532f8998d8e50f31afedabb19263590dcf8b9a92eee", + "blk.4.ffn_up.weight": "4a72af7cd28fd07b038f6cc4406678d120517280236ea85d9e76eff40ab2cc22", + "blk.4.ffn_norm.weight": "1805b37b44d5d682bdbd2fadeafb763ee001617d7870848cc487079ee34b21f9", + "blk.4.attn_k.weight": "a1e4f9d97cdf4c1b0d177cf00c4e32d1be30c1984a239b3c9bd73f8848888853", + "blk.4.attn_output.weight": "a1547e2497c423b0aff0eee71d9300d6fdf4e4986679418b6e637b69a9a6720b", + "blk.4.attn_q.weight": "0677483a9264ea6803d03d304d87a54632242cb516e8b76b6e3e8284c2f4de04", + "blk.4.attn_v.weight": "02691ba3af344fcc1969428ab0df811ac94aaa2fd91b0dc4ec1ac0a58806980d", + "blk.5.attn_norm.weight": "ba9c028335e5c895b87a5bd1448ca429248f9746ed97bdcb8679923206117156", + "blk.5.ffn_down.weight": "ccfdc9006acad1940a6bc05042a3947f1066acd671e0bb53b7684e9eea9ef5c9", + "blk.5.ffn_gate.weight": "623157679f1e742ccc3807c0b0153ddc8450104de75ec62f1370ec3807c09cf4", + "blk.5.ffn_up.weight": "05748804c65091f963729b58b085f58351891cac8a2861f5eae26b06aa60b2a0", + "blk.5.ffn_norm.weight": "84bae55af2efc8b8429f09056c8c04990c466dae31cb3f9356038b8957f1b406", + "blk.5.attn_k.weight": "8c766180c726b037d587fc52371de6e3307140c52409011609d1225624b6a3eb", + "blk.5.attn_output.weight": "490b582b3b1dc151ae55aee8b6743dad6c01fb49e43afefb6e68394b74be3d73", + "blk.5.attn_q.weight": "6f7b8ca4d9025ec836a44bbcca46be30c66b471a9fb62943ddff8288b3731409", + "blk.5.attn_v.weight": "9f70df3ba00c9e723214b3da83ff435a2163fff5915f75515c9664c05c866c27", + "blk.6.attn_norm.weight": "1a4a66613a682df6f061fc7c4d986f9f7e9175b62f0c42fc1ef31db536bd5942", + "blk.6.ffn_down.weight": "c56f25e4e49b443dbc82d88311ee63bc1f5002cc67e52f4787fd5f003aedeac1", + "blk.6.ffn_gate.weight": "31a5cf1aa9b831a81588d508550f51fc425f9517c43254d4ef7096d38029cf04", + "blk.6.ffn_up.weight": "ce135f3a1163e0c9297a615bdbe68a67ead21edce8debbfa9f6e15e6af8d4c94", + "blk.6.ffn_norm.weight": "4e328ce0648c94e732bc40501858ef6262ad1161e2e407b0cdcf4813fa9d45d8", + "blk.6.attn_k.weight": "1eb1c4c9f9c4c7ff7f5429075e0dc6a7782bed55109fa88df209a817dd8ef960", + "blk.6.attn_output.weight": "3d32986b56873b88655ee1edabdd413fdd9ab18b82108c9ce90bdbc2d3a6f3a3", + "blk.6.attn_q.weight": "8432f583b3a2809c99c393f9beb077cb0534dd5d247c17108f2986cadc6651f6", + "blk.6.attn_v.weight": "5045381513815bb91839dbac8335ffe49bbc7b0008369de7ea97eb676c5e2b36", + "blk.7.attn_norm.weight": "3dabd003638ec2499bfc8a48c49eef34276caab4fe76894eb963207848c2fdaf", + "blk.7.ffn_down.weight": "194fae858608bdcffd235be59ab119d0b91c8549f864ea06dae69249e099935f", + "blk.7.ffn_gate.weight": "00b24c29c30246892bce0791be804a89701d4c1332777e0bcdad5d9d5666604f", + "blk.7.ffn_up.weight": "44d7082a5280080c90cef9e19d410391de34f212ca0736377769b8ddd0c82d5e", + "blk.7.ffn_norm.weight": "21fe8a7fd6911c64e0d15a788b3b4cb6d71dd6ec51de65f760ee89afbb6ae53e", + "blk.7.attn_k.weight": "57a149eec5f6744a9526cd3925ac073f9d12db0fbcb5afe042ef4dc846458c44", + "blk.7.attn_output.weight": "0e9c28a3e81a2880251ce5eed77bcb8be8aaa1a51c9cb6de820b47ed83849fc2", + "blk.7.attn_q.weight": "15ee75263ee4e2a43eb322bc159ae004bb7d77e3a7e63ee4ddab700430693fff", + "blk.7.attn_v.weight": 
"440aa970bba4bff429fd7b7b1de21f2ad14fb2952b776cfa4acee68d7c6e9b8f", + "blk.8.attn_norm.weight": "af5b44825633c42c1ae964c82bb2be6a242d3a751f0a91f1bae4f593e8f5b6ec", + "blk.8.ffn_down.weight": "b11c14c76adca94fa200496dd2c10743becb23aab6642443ef1ae6d8710edbc1", + "blk.8.ffn_gate.weight": "7bb03d3325bf8637ae2fa1296b0651356515578d46a7c5ca65c7a923d7de27bc", + "blk.8.ffn_up.weight": "b956ef0a0669b5a9c9bf3a8da2d1c24f52d331cfb7354f6d7c51bd65be355e30", + "blk.8.ffn_norm.weight": "c78c3d748302edfef76f71ea5cb2055c94352122eee8b9b1173779a1814d224e", + "blk.8.attn_k.weight": "c0fba6a596ed9c1c32a7055c31a935a8b31e42b77282ee47c1f03ee3bde736b5", + "blk.8.attn_output.weight": "83cf9947080c5d8d571f04a842bc3dcfe7bbb0195fb25b346e22635e8649f2d4", + "blk.8.attn_q.weight": "47409350a576b333d97b7c877d69f47f46df504f3765102dfc0be9e521c7ecd6", + "blk.8.attn_v.weight": "1999dff91404fdcf1ecb34d9eaaaa9244ec7658a74dec8feb7cfd1fddba0347e", + "blk.9.attn_norm.weight": "1e6e29d5c3889ab4e1b0a5b9998cba60179b0f1fca133515df49cbc19d092593", + "blk.9.ffn_down.weight": "acb898a6490adff592e10b4c62d70edc5941661ee6da44658500e9205357c8e9", + "blk.9.ffn_gate.weight": "4cff63013593aadc3ffbaaa6ed70ffdba1224cd43c3644bf6f4162b5ac1ab542", + "blk.9.ffn_up.weight": "f985b5a2d6cf4fe32c7256301c3c89b8ad22b59e516342c52da42d8110766a4e", + "blk.9.ffn_norm.weight": "0d659c538bc6b21ed0018f107ab674a7424a00a42946c80e07208b479b21918f", + "blk.9.attn_k.weight": "f67611d888780d1b38c1c146b361c65310c8183bdf64fd73e2259985c6e8517f", + "blk.9.attn_output.weight": "f12ca1fa62a02ddc3f77f798bfb5707e0c50bf18ee0eaa67025521a98355f26b", + "blk.9.attn_q.weight": "3865185f4361a645b086ad47b72904c095313fb1c624e511647bf1a7dfc1c476", + "blk.9.attn_v.weight": "92125bbfed63544ab56052bd1e4aa453bbf34c795249ee54cde54907c8c6d1d3", + "blk.10.attn_norm.weight": "5d6bfbe545bcc2fcb2fc75c68f64b1f4c918badaf53e0156fe2d88aa977b2f94", + "blk.10.ffn_down.weight": "1dd9da8b0d2696ab5531fbca8a29c7d67567620a9d3e5fc2a19ec5d7e4c6cc8a", + "blk.10.ffn_gate.weight": "6e55e7f014edaebda0ac6819a426221d3b025c27312a2e18cc5806f31e3db226", + "blk.10.ffn_up.weight": "d80dde54af5db51241345ee8d64c1972608644f4deeac1e8195dc423bf27474a", + "blk.10.ffn_norm.weight": "f6ca65951d58ae3379eee8247bec34ebd0db05674cc9295593573841b8a55df3", + "blk.10.attn_k.weight": "b58e350bd6b49aba0fba4e4dd6865de3a2a0651ab865dbf2419b627b53ffc187", + "blk.10.attn_output.weight": "6b26a986e12fe66ec286a21d7d5af5eaa1bfe6f2bf502165d270e4497235a54a", + "blk.10.attn_q.weight": "3440e0e5b7e0d1e426424ae5a33f4e057be623249e9035ea12e57dbe5d3893c4", + "blk.10.attn_v.weight": "ebfadcfe14bcd6dee933053df0a67e12e7a196d5cc45728c1ffb2a2daedd5ca2", + "blk.11.attn_norm.weight": "3ed057b9576cd2de84507ef64c7646dc478c651efca4c2024cbe91a4f3fbf0bc", + "blk.11.ffn_down.weight": "8ff1c2487d22f5c499761e4eb721418f141f960160d0bab779595a34e4d68898", + "blk.11.ffn_gate.weight": "9c74e4507c7e45bf39b7cc7402198cd1dd77e3fff8c625b0413acaeb16efeb9f", + "blk.11.ffn_up.weight": "4367158007161d29939e00a322bb6776016e43f648a94f9b08a96a477aae75be", + "blk.11.ffn_norm.weight": "1cc0288c1491072121f4c9a0af20be0e13af49895696a3320e4fcac608768de3", + "blk.11.attn_k.weight": "066f5b3c144fce1366835e1ebf376f768b333b8ae29f5b478c42d1d0c809c855", + "blk.11.attn_output.weight": "e0d9f3d3f2c54aed59c02713ea4fb562799ddbacbe67ca3998dfc887bc44e47b", + "blk.11.attn_q.weight": "28d3ecc8a88cb3815e89a7f7a7d043da7a71f702b337a126e4d3a2ac1cd6370f", + "blk.11.attn_v.weight": "7c5cdef10ee73bca0a3b9f6ece5f0a0155664e0ce3d8de90ccdccfab5545e5e7", + "blk.12.attn_norm.weight": 
"973b133301a1af760cd7b3a7955371ea0a750808b442deb6adaf7b98482bd0c6", + "blk.12.ffn_down.weight": "d6c87b4b4ca03f75546ddd6a9e7fca720585a309188723c1ace8122438d4b200", + "blk.12.ffn_gate.weight": "2189a6e0cab1540bd05d6089b922aa8fd694be51255654933c165f302a0c955f", + "blk.12.ffn_up.weight": "5affbec19b58d092b9305721e3552481fe2eff51269ea3ed91cda3b9ef84d4df", + "blk.12.ffn_norm.weight": "f650fd42a34e950f758b4a130e7b8b1a712b1dcbede0291bb8edde47aaed0ef6", + "blk.12.attn_k.weight": "59b1e86f10450a7cc188beefc0856d2dcf44e8d7fdd9cd8859c30ec1ebaf24b6", + "blk.12.attn_output.weight": "446b0d36b2f66bd72a2323f4f4e9d85a0f621e9a58872e89a27248d6b1123238", + "blk.12.attn_q.weight": "3ed6bfd39f040301ed99fad882d3e569769d594259f9948445bef0e44ec881fb", + "blk.12.attn_v.weight": "e73652cd5d0029b1931be3ba9d82508f6696dce5a29d085476a54fb7a2ddbabc", + "blk.13.attn_norm.weight": "491b85278c0bd67bd31b9b8a9720902c244bd067e53a4a03641b7c0994782e82", + "blk.13.ffn_down.weight": "ad71cc248a85e9ced49307a24a9bfae01d387e979a7689c82ff59998e09741f3", + "blk.13.ffn_gate.weight": "0a55984d53971fab97575ee0ef5882013be7fdecfa76e3fbebb5dc85a07a14d4", + "blk.13.ffn_up.weight": "378b697b35e2e53c0de98e8e29b73d42ae3ec112ec16129aa5997a9e2f3b5943", + "blk.13.ffn_norm.weight": "f8aff2f69ab286210fad45a62b03f8d10b38f96a420d7baadf6b95d7b0b0bcd2", + "blk.13.attn_k.weight": "25ceb841afb1034831bea7f4d6a6c578def2ce4d4c412c780ef147dc9a598360", + "blk.13.attn_output.weight": "a242b322889c6bdaa14b67a7bab593db39df8eea3721638ef639abbb74d482e3", + "blk.13.attn_q.weight": "d80be9945a369439e835c55cfb0e97828b8a66bb7ced534d9059c92487bf20a9", + "blk.13.attn_v.weight": "ac33274cf9b67979d9ecdc967a55175afe0c9c4aeeff6391433cd9840c818706", + "blk.14.attn_norm.weight": "12a1e1091de5b2da12c9e7c0b1c8e6f09ce2a749733cf7d5240445b8e21cd093", + "blk.14.ffn_down.weight": "cfd41965c88266e32bc2dcdadda512499c35519e8686fefb9a7f249ab2291eb5", + "blk.14.ffn_gate.weight": "8dcfe774f07a095c7c6cf0a901c9df70d938bad7b5ba347fbc8f694e7603c0d1", + "blk.14.ffn_up.weight": "c7995577fe4a72ea0fb17c4a7b6b87b959072bbfdd5edacc6c367d43465809ae", + "blk.14.ffn_norm.weight": "81c41ebde41739e7016ffec31d2256217b825dc3cae049a935f5f61a60d22003", + "blk.14.attn_k.weight": "fb708bdebe4384f5c4b479c110028554f4d122f166b8091eda7d8d65e6780eb8", + "blk.14.attn_output.weight": "f5295caf2dfdc60553dcabe17537a80577e8b153c902247daac058df23542514", + "blk.14.attn_q.weight": "c12b7a3601c68c63ab5dc9d2599ebf3f3a10abc2c59d3a2126fffd5818f2763b", + "blk.14.attn_v.weight": "1ce968d9149bf0d5e237d52cc6d6433565b4bbf03252a736262bb00a2b34a687", + "blk.15.attn_norm.weight": "266fd2c36d7dcefc6b6bb7f1c9374c41f2bab5d6c84a063b6f91c4f682dad3c4", + "blk.15.ffn_down.weight": "6154886e9ef0a6cc08ab0d264a35f497e6f0987efdac992ed04e87088bea7801", + "blk.15.ffn_gate.weight": "183d9fd3c1b5657840099053d2fd3f72ad953b1de523296159b7761f20491a76", + "blk.15.ffn_up.weight": "51546d4498842ae2340ee226a0888d5f61e7d2ca4d052dfa06a77b0451242d3d", + "blk.15.ffn_norm.weight": "ef7378091a41a25a5f58bf1bf9d3bc64ea562e7f421e1c232b1f177c30fd3500", + "blk.15.attn_k.weight": "8d556ab8d9639324141774999b6eed0e91d7ee645bf3e7a3dcd200b2e7a00751", + "blk.15.attn_output.weight": "54aa6ba87def7cbe18b0c6ab3aff5c351cb3b6ca4a0d7b2cd5f75a1312991429", + "blk.15.attn_q.weight": "10731b0dc031ea8e0ef37bd7f010e0a78518a10a6df05a8bae48e3148b73ef3e", + "blk.15.attn_v.weight": "cbbe50c2ed7224866d3cf9b489c599f3ec41a4ea1aa3181e9f4e87e1fa0cefec", + "blk.16.attn_norm.weight": "387058eb39d4b28c04cf1368247417f1faeae8ae79d894c9f293457e0eaa00b0", + "blk.16.ffn_down.weight": 
"2cb26ccee585e933401ad5c82ed36ddacb3289efa0b28f8cf91b020ffbd9c333", + "blk.16.ffn_gate.weight": "d745985efb5bab42304e5d509024631efe35f92f2b2ec4931ead6db97ca9727e", + "blk.16.ffn_up.weight": "7a67bd195e0642828ca36eb7818149bb70c2c25f82de07e2b5807c520daf540e", + "blk.16.ffn_norm.weight": "7cefd061c8182482a89272f8a4e88a954b12609a62716923ca1cb3593b1c1651", + "blk.16.attn_k.weight": "d7968a2de67e755b4533e061aaad1cb62f8882af92dcad67f99d6d5112513439", + "blk.16.attn_output.weight": "9e9ab5788272ca3394ea89eadbce8c86ecc3fd75b7899184d6191c134ad9aae0", + "blk.16.attn_q.weight": "ef81c261b536c1a3a093b33f44cf2d42b86e5aa2d821674f07a0c80e992ed925", + "blk.16.attn_v.weight": "aef38e7958301b4a437cbdd2fbae6197f677b09269ec1eaf63188cd5da428d25", + "blk.17.attn_norm.weight": "28f6b289f1bc3131041e9f791b7a2a3a48baee0dfea27bf7051ebbb7ed364d80", + "blk.17.ffn_down.weight": "1a502829aafc6a9bd6bc81f12573bf8632d5c8c659f0dfb13c8b2411f3b1ec05", + "blk.17.ffn_gate.weight": "ddfd8aa0eb98846ebc9afe31366249159f46ae9815199dd70161527ed241ac4d", + "blk.17.ffn_up.weight": "4211a3cc247071bd361b30de2131d02382f552855062bf3b3e004c17992e5d09", + "blk.17.ffn_norm.weight": "647e5fa99a5b0d232af36d15816539f4d27e60a50a341b00aa88bb6e4474f8b9", + "blk.17.attn_k.weight": "d9125ff33a19c502c0f8846433ffc24395048582fc2f463d34a0301a82156f02", + "blk.17.attn_output.weight": "3d64fbb1cfef04444827f37c35fd9ad3413eb2165094d339ef89f00503f09de4", + "blk.17.attn_q.weight": "e5b29424028f578beca385fd82e29f37adedf3037cd51e5889d5a1ffb0428ca7", + "blk.17.attn_v.weight": "1809c5aaf2ac04c5d65539097564ad62796e87d24bb8b9ce5b095561a61d908a", + "blk.18.attn_norm.weight": "99daca58d001c627523d3adfbca1d95f04e590382a326866544d57989d5f4835", + "blk.18.ffn_down.weight": "84f30231ce6ca0f10227541dfc602d6418c1a210386b0c4926ef1656e7d4635c", + "blk.18.ffn_gate.weight": "ca5bbe4468b541740e54f69b9e08fcc8e478c344b70551dab21b1206acfbaadb", + "blk.18.ffn_up.weight": "0b3067b9dded31686dcfdc1e247eae3974a28a61ac59e9862758dbfaad64e8f7", + "blk.18.ffn_norm.weight": "8154a102232dbc0f90ce77ae5c1ff8f26f8b6e4dcf326e9ec1645749669e7960", + "blk.18.attn_k.weight": "25abb26021ccc481471a30e0d4cbeb7e1db29828417ec5136edeb93fecf09ac4", + "blk.18.attn_output.weight": "d87d481d9b046b68efa06ccdd4ed8cbf61e692d61114b75b7fad5ed75f5d87b2", + "blk.18.attn_q.weight": "cc6400379e15766992ff1293be79dc67682c28e9e15155a78109f4b64653b164", + "blk.18.attn_v.weight": "45c75cb1dd496aea3173aafe2575b841dd1d02cbe010b3198099731eb98f531c", + "blk.19.attn_norm.weight": "65389efc75297684773284ef8e5f8789a4504b636c9f33b8a32e0ee42499fa72", + "blk.19.ffn_down.weight": "4eefab7e939f64a17e4a214ca3c77a6fa110d94f677e2d6401086f70fc538b04", + "blk.19.ffn_gate.weight": "f1c0a59cafda66f466ab585b0b8b4861b58abe87a67cea1f6a488492242edfdf", + "blk.19.ffn_up.weight": "c42d045eef588db4a0e56960a57e110e1ff92eb8041107d19899165fd3b90f17", + "blk.19.ffn_norm.weight": "a8f33eda6d5d62ff5f333ad9771783caff556641f4e7df713451385676f441fa", + "blk.19.attn_k.weight": "0bab5d9e9083492bfb05a5a3bb23b79c0e7b99ef6a6644817b4d57d5c453b8a5", + "blk.19.attn_output.weight": "c99c551d70eafad0f7aea98fb6f9251635897168eb3895f76abf0d4ea3b3aa6f", + "blk.19.attn_q.weight": "c98bde95627c3b54c9443813ca50b4e14f518319681db6bbf7b2332ba26e9a60", + "blk.19.attn_v.weight": "ff3a490518cf64904db89ce0dc7d6eb89e870f1440e41883c6b55a221f82de84", + "blk.20.ffn_gate.weight": "761f0e317229cafe9d3754048ab038a0a84e9a287b196ab65f633139f2d29aba", + "blk.20.attn_k.weight": "45d13439b41066d282e8490a726785abf513605f46c79bd0c840f6419d27e790", + "blk.20.attn_output.weight": 
"a3b958d84b4a097844179b7d55c18fd0e4f319cb15e918c6fde33b68de1bcac6", + "blk.20.attn_q.weight": "127ab8e7d8c3f882874904196a02712bab42e6744fde45871b67350609d19f5e", + "blk.20.attn_v.weight": "5f0ad2d14a8ae42dd3bbeccfb33295687a14055fa92c54bc946249373c1c9f17", + "blk.20.attn_norm.weight": "77300b1755edc8c70089e0f45efa646056b9add7d8568b2324d2f3e62b64971a", + "blk.20.ffn_down.weight": "ab93d0e075b42e9017b701a070d561e698050d90aac4b4b9919256fbe50c3204", + "blk.20.ffn_up.weight": "4fd6628a07acc57a48d1ef83f81b7d7aa0bce569c1160a99d307284f8821322c", + "blk.20.ffn_norm.weight": "2a9e46b9e48e8e55215de56592e1f189530037c1c94a1428e3d6f106c7f26fb2", + "blk.21.attn_norm.weight": "4b3b5912c7bc61eb9da8e47d4651f896e85d9e59c4ecaa65df7acf3c21737298", + "blk.21.ffn_down.weight": "7146f931663d93b8771cd84405cd4802ea6560d0729b0d6d44588203c095bc53", + "blk.21.ffn_gate.weight": "b44ec5d64388fa40b90b3e9976d97a8b6800fa3b97584f32e64b03daffb8601f", + "blk.21.ffn_up.weight": "0cf3643fd23c685e17062cd11e116e17ce57a405e5e78953bab94cd62fe48789", + "blk.21.ffn_norm.weight": "4ef2cdb53da166df70b39f3e6b17af51848cfa5ea3c27ad6a1ae2a1bb1da1ce9", + "blk.21.attn_k.weight": "5d40f32a706f670c19972b14176bf660d5b045e3637b110dbf8d7de4ff32101a", + "blk.21.attn_output.weight": "18afaa916752ce16c9653ec0ec7e2fe60be55faa2aa5025d147be184adb75cac", + "blk.21.attn_q.weight": "2621daa5f858931514a4b2f0fe8d81cf9b96f541e6af99bfa7539e9bde8e34ee", + "blk.21.attn_v.weight": "63226dafc54c899bbce4aa49efceeedd8908e94faa613450fdda91f332b62864", + "blk.22.attn_norm.weight": "cf3058daab4d2c04387e7d169d1553bb8e7358eea66285ec067703f6ce62043a", + "blk.22.ffn_down.weight": "6a58d5fd220abdbac6cee7ba048abab794731af318f04982c2506df59413d0b3", + "blk.22.ffn_gate.weight": "d5614535324b03c7b91727a903b2a72f8d07ad17f7aa8b61ea173cf9b895069e", + "blk.22.ffn_up.weight": "ec20da3949566e93f66cabb67f8cd7eab399047ec6ebf5d43edfaf3669b82296", + "blk.22.ffn_norm.weight": "84c82f38f53a649972a44466fc476bf764e064ce18de870291edc302f3700e28", + "blk.22.attn_k.weight": "a3d2ecc37fde7c201176bb8abadf27f0d8ede9679a6034913e03d9db924fda12", + "blk.22.attn_output.weight": "5a3b8bb433f43a387df43dd371bdf80ddfac986dfeaf38e9bac1d7a0ec6628de", + "blk.22.attn_q.weight": "3a875cec661b4859f30a8fd2c866811184b25b68c9e36fe2663d299caf8b59c6", + "blk.22.attn_v.weight": "8717a83b79035058dcfd3ef6f8e5b36e71d77379e5a239e1899eef8766fb7703", + "blk.23.attn_norm.weight": "2b4a68a0a2f023dd646e4755c9bef17c2f631901154afd839edac7ac006ec99c", + "blk.23.ffn_down.weight": "29499b1586c6fc4883c9b7a9c8cf388035146b5aecf90c5c4c8c8e082c71e7d7", + "blk.23.ffn_gate.weight": "7d6554036d21c587b9b556428054f9c15cbef96d24b257f906fcef4ae38bd9c8", + "blk.23.ffn_up.weight": "19761ecb288d6ebd44b681c4535661583b1e19dc29e96d0c007333cd8f00aacf", + "blk.23.ffn_norm.weight": "37dc35500790a4ca33807b39cf7af65065e535dc25b9e94f3ed2759f61887ac9", + "blk.23.attn_k.weight": "717547d00323817b0cb40a72ec5f8cf42ecd1f9e3e42715c2cc5e38f07fffffe", + "blk.23.attn_output.weight": "a24786feb6a905fdf166d7500133757cbe494779d4ebcba9eb03046b319557df", + "blk.23.attn_q.weight": "6a2c4a98f138b928d22136efa163562691d3b4ed526d52d46a2fa2694a8f3965", + "blk.23.attn_v.weight": "c6e6081eb9c38a7fda023085957b460e9ea321e1fff408b38c2b58595c39979c", + "blk.24.attn_norm.weight": "5e6283f891e538670425f3e244b08dc6f96f33dfa4aefa913f8eb17212421850", + "blk.24.ffn_down.weight": "e09eb170f389deea0a4a1cbfdb52c12490768a2c60491b7bef8a4c445e2a08f5", + "blk.24.ffn_gate.weight": "af29d815cf49a38fc2ebd0bf9b2dd9933d023a29f2d766981acb9a1b53f09117", + "blk.24.ffn_up.weight": 
"36ccd9333426666de9d3088bd4dcdf5b624b09dca9e3a83a22fc0383f2d950fa", + "blk.24.ffn_norm.weight": "a88e1692318826db6ac42582d182e51a3c698c655d0e21e04fa086318832d07b", + "blk.24.attn_k.weight": "f7d61d6d1225289bcc502e3bbb0168b4584add0253218c1b77ac92ccef9a1c2e", + "blk.24.attn_output.weight": "85a1363b3ccc87312094c2195022687c16b0dad7fafb9e80bb4ec474d53c29ac", + "blk.24.attn_q.weight": "53482a2c008f42f4fad779ca323addc3712040149dfc12f782417756388a72bb", + "blk.24.attn_v.weight": "67498272369af7dd10097c73b07f731b565cfc9a559e711cc0d526389e7b44e2", + "blk.25.attn_norm.weight": "98dd617def5cb7825ee4833132ca2da2121245921585e1d9e36b93344adc321b", + "blk.25.ffn_down.weight": "7fd477d6c50aed5f424a878dd284343379cffbee8a34c0b6e55100c8305fa13f", + "blk.25.ffn_gate.weight": "f892c9806c8ec22e8aa746734ac9213428c534921cf161239e1d249fdb5d1ec0", + "blk.25.ffn_up.weight": "528bed14c9bf9762f790525ee40412545221f4321d2a2323fa8e73c58b7643c5", + "blk.25.ffn_norm.weight": "ca5831966672e7be6a578feeb631ec3570d3b5afe12860819ccb96e896ffc346", + "blk.25.attn_k.weight": "610d3068cc9b20401f0c3a0efea39a279dd9f564fde19baf3403b2ec2319e4c4", + "blk.25.attn_output.weight": "798aaf702e53b657265ac3b5e6caf3a0ab515bdadfeb1a3a156b4f3bfba76666", + "blk.25.attn_q.weight": "8a7fa25248de83029fb97b51d036a01baebe31fcb4be121ab00dd8b7de209b10", + "blk.25.attn_v.weight": "2a53d5e9f8a1218c66958c6388d3b37400a9af7956c785024ca44bfbc3c7d371", + "blk.26.attn_norm.weight": "5f44fc043481eb0771f3e6d2420bcbcf73140afb9a9feb8eddb6575452acebee", + "blk.26.ffn_down.weight": "944a60a409d0d5b6a851e33c69aca152454b691711a8b96f5bcc488772ab2833", + "blk.26.ffn_gate.weight": "2a0ca4abb3de5593e6693d8be69b63d6d1a639855ac8332a75f520353f030c62", + "blk.26.ffn_up.weight": "0b1df496163f9ac07bf89375d3eb441b51a81d41b47d769a04a61efc18dbe35b", + "blk.26.ffn_norm.weight": "56b8dd046e9be6ea71f7efd80dbd14e7fb1aa020d3cd38e063275f3873fd12f8", + "blk.26.attn_k.weight": "b1dabfabb970e6971c7ea6e53c63cf7ef56341e6a2edd9cf177785cad9af2f9a", + "blk.26.attn_output.weight": "39532c7e836baad164a655fb97ec5114ea4da37ffba9fdea2684f6e4450e6f84", + "blk.26.attn_q.weight": "8f48bf6aaa1252bc149e98af2be1777a5c0d2c3274c6d314171ea9344a41b604", + "blk.26.attn_v.weight": "02fb145f7fd905133750e90571effacadddfd3f4966552dc59982ac3900ab8c4", + "blk.27.attn_norm.weight": "654d168fc3cab716d91261f5719f180b7d697218401633b4878a759f1b5283f2", + "blk.27.ffn_down.weight": "2823272bec3a1c12f02cc4cb24aa4031abd7e9dbe0b02676e2305b21671818f0", + "blk.27.ffn_gate.weight": "b1a1d40cd02f97182cac17a79971d1934ee0daf3aa0bf11303568c636e208a64", + "blk.27.ffn_up.weight": "ed62ec72a020d070e64eb7b50237b32213944727b5b2427f45d989f50df5fb2a", + "blk.27.ffn_norm.weight": "c69649ac65d694b306a905dee8b03b89eec1ed188b1eaaf38f8e29d4b12e38a0", + "blk.27.attn_k.weight": "cc57bbf413f1fd227128dc66efc8590c73634cbd6f96d01ec4878b5e7ca6a925", + "blk.27.attn_output.weight": "cac407ad02361d53207b3c7e25ceab84dcb4347b8087055162e2efe14d11d84a", + "blk.27.attn_q.weight": "0af18e07cee12015761c07c94407024f4f4d77d97bdb24163db0e16669e2cef3", + "blk.27.attn_v.weight": "a1d08fbdfa40af773c5adcf93bd68b78a44ed144e3fc6bbeb8af02e937527eb6", + "blk.28.attn_norm.weight": "f39a51f814512b040a1082143150e4a49ff730f85cef49d7f77fc79d83e91f40", + "blk.28.ffn_down.weight": "74f29ed51055d1c1adb8f0660bbe538a27e016c65650f2d67efc6f1c84fa1b45", + "blk.28.ffn_gate.weight": "ae48bb16487ded6781c60aafc0bf738fb4ae15729952906f247d216592ce249a", + "blk.28.ffn_up.weight": "543009727718ac22f11ee4b17815f68ea6f15ba1f3e7ed5ecdb755cf6417565b", + "blk.28.ffn_norm.weight": 
"b8f9e54c322079ff20a82b88948cdc2916c22c7db40b9a9ed6d3cbe89efb727e", + "blk.28.attn_k.weight": "55d055ba653b728d6e784f9e013786fed07115c9fdf23367e3941386d5e77db8", + "blk.28.attn_output.weight": "155101c03ddbf18f4fd0694bfc982f33c7bae25c9b087d6f5273c2bfbffcf2c9", + "blk.28.attn_q.weight": "1ed19bfdd22e9c14eca014739982492e9516d411515a8585f65cf754d849e53f", + "blk.28.attn_v.weight": "11ba854dd575c025d37256eee9041f6d1bd2b549a083d6409a09bfc1542913f3", + "blk.29.attn_norm.weight": "02b0bf5e2fcefd11a153cc988c81ba672682e4844fcf6442423e21a0e10d566d", + "blk.29.ffn_down.weight": "594bb692ec2779938721ff4748666ca8370e0e4fe85229503f616438b8884f5f", + "blk.29.ffn_gate.weight": "8bedcf47e91dcb2cf4093de56b048ee411faab6ff472f89ab2c9c113a08e6967", + "blk.29.ffn_up.weight": "e241a547b5fd6dfca8200b8141e21c1c487a96cbc4e5855f181a7ed1be91b642", + "blk.29.ffn_norm.weight": "e63eba5e4c6b288bfd9f15e46e236086456c8b7f1f9c732c0b5de84962a2e7cc", + "blk.29.attn_k.weight": "afe5979d5bcf211aebb526620f5974bcb0a2c39c8be71e815575c55d6385e3aa", + "blk.29.attn_output.weight": "9c944ed44b124b014906fc240afd3b90aed56bbd9567f2eddfd5b7a685b3cb48", + "blk.29.attn_q.weight": "e234e08e5c1bd9245a2edc8d63e9933b6b879f97c01392209cad4f55f05f3ada", + "blk.29.attn_v.weight": "5cb8e3e5f954e775c5a5e4de7a9a62b17e9c6931bb0ff0e2f82c4126fd3e1a1c", + "blk.30.attn_norm.weight": "a65483ee51a0b214144ec8a14f28ea5437586e9e12ebe342a57d1f8627ee12af", + "blk.30.ffn_down.weight": "417959da77ceb33ead4271cbb9428b195196173a893c44e52880a7ec61b4856b", + "blk.30.ffn_gate.weight": "a0d503ffcbe45dc927600bb98c9f6082487e65cb577ab545add400d666a87638", + "blk.30.ffn_up.weight": "f8ab957b82ffcd10b21303cb5e866209b6fe95f827b1b94e9a949207952d12c0", + "blk.30.ffn_norm.weight": "210c7ceb0514a9ef27b5d4d1b3aff6dde43f1af0345a050d71097940e0e73e03", + "blk.30.attn_k.weight": "16861b9abcf5a3fe73c93d977ca45a1e6daa65be0fd85c2cff53486ce2033afa", + "blk.30.attn_output.weight": "ca541fb2e57e2257118c35784845b0c731278af8db3036ac53d71aa1681fdbdc", + "blk.30.attn_q.weight": "f7834917748e26bb456b945e230bc926c228e93696bc01fbc2b134bdeeac71a1", + "blk.30.attn_v.weight": "9292783171dbe5eb689d17c9bda11e537f0e9b328fced6986c938d61ed590e81", + "blk.31.ffn_gate.weight": "e4766a04bcd8f937ba883c6a144101e546747804ca66c35c97281d6ccb47b566", + "blk.31.ffn_up.weight": "cc1e666116f7e6b06736db4aa4b81003c583f54f4d9200bfa48842249940e16a", + "blk.31.attn_k.weight": "fc80b57557687504efae7d24265cb7dc39b8f826bb3d897a11783012dbedc44f", + "blk.31.attn_output.weight": "215617f50a1f5d9b2250b82f3652b35a9e9aa0ad9ef2b485d73965a14b2b872a", + "blk.31.attn_q.weight": "274b4f1dfb0bdec28632705677049fb3e327ce6d9e1f3baaad1560439039982f", + "blk.31.attn_v.weight": "e641b8b926f9dfcbbf6b6da1c02555525ac4b1c306d96f20cfbba7d6662c4e56", + "blk.31.attn_norm.weight": "b3243c361d4041ddb892ce6862dd5091f57d87357e3c67e177451b85d8baf34d", + "blk.31.ffn_down.weight": "0a00cd3ecd5e91624a27f9e239b1de425d5ba3cfff82c256a11a4ad434abf3c2", + "blk.31.ffn_norm.weight": "2a0d67ea2bb1303975712243f07273c92fce83baa11b1cd6d8e42e74ea3c810b", + "output.weight": "768615f077fb797967844571c58b94d7c399d884d115be3ab4b0154504cae892", + "output_norm.weight": "7cc5b7ce10e5082000fa00bfa68af8c7c5da218e59e2c41cf2f1499d40ca229e" +} diff --git a/convert/testdata/Mistral-7B-Instruct-v0.2.json b/convert/testdata/Mistral-7B-Instruct-v0.2.json new file mode 100644 index 00000000..1da4d2ad --- /dev/null +++ b/convert/testdata/Mistral-7B-Instruct-v0.2.json @@ -0,0 +1,313 @@ +{ + "general.architecture": "llama", + "general.file_type": "1", + "general.quantization_version": "2", 
+ "llama.block_count": "32", + "llama.context_length": "32768", + "llama.embedding_length": "", + "llama.feed_forward_length": "14336", + "llama.attention.head_count": "32", + "llama.attention.head_count_kv": "8", + "llama.attention.layer_norm_rms_epsilon": "1e-05", + "llama.rope.dimension_count": "128", + "tokenizer.ggml.model": "llama", + "tokenizer.ggml.add_bos_token": "true", + "tokenizer.ggml.add_eos_token": "false", + "tokenizer.ggml.bos_token_id": "1", + "tokenizer.ggml.eos_token_id": "2", + "tokenizer.ggml.unknown_token_id": "0", + "tokenizer.ggml.scores": "e3d3eea80bb41a1213f2d0aa3e8a38581d1f19323be77dbd779c9c7e3b72e676", + "tokenizer.ggml.token_type": "6040635e6bd38d98af06698feb75c1802bad35180ee6ae0a503e38c0f60fd71e", + "tokenizer.ggml.tokens": "604ac4bfbd019e430d7b6cdf18c6c0cd5b967900601f0307f714ec7773aa5ca6", + "token_embd.weight": "cde834ccac5e94324b25cb81b02d27312cac0c551b55a7e1d555d90bf6cb6e81", + "blk.0.attn_k.weight": "458bfdd9715c66e017c2447b1ed3c582963a3111479314e664faad8c914f42be", + "blk.0.attn_norm.weight": "e1fd60b95f713bae7b7e3ca933c64ae6c9cd1e8d808000204bbfdc19f0ba635b", + "blk.0.attn_output.weight": "df13b6a157d9d4f96c53b012b3b9bcd207d0c94144cbd22ae3ec13bb07d6c373", + "blk.0.attn_q.weight": "13b4126b4245bf06c915a93317c42b8174e05053535ec99dc576541e4cec7c25", + "blk.0.attn_v.weight": "5b1781d3a341214511b27eb4e268674ea3ea829dbdf8ae5a6bb89b3c0b33fafd", + "blk.0.ffn_down.weight": "49186f5d8148d316b07458841d13a2e66587f4af69b776188a809591ed9c070d", + "blk.0.ffn_gate.weight": "4397e30ece09136f00f4ff84ff49e5241b765a374deb8c5a12e897e2bf73473e", + "blk.0.ffn_norm.weight": "43260589aac3850a779bca3f9649f793bbfbe5db538361cb743b3830217f8287", + "blk.0.ffn_up.weight": "fd7ac918240a07566f6967527ffca58fcf433a30b78fdd6d84b2136d4ebd9987", + "blk.1.attn_k.weight": "209839566c7d235bdc20565a4766378b6ee8553133a5a3315abe8a85baa80712", + "blk.1.attn_norm.weight": "58c52986f7c69784ba327cb7f350923420782bee17fa39b1fbd13839d4005357", + "blk.1.attn_output.weight": "5067cc628449682665dfcf59b16e58fe2a9d2a81cb099f0fcd42f4f8670c6740", + "blk.1.attn_q.weight": "f410f9f0dd5edc09401af597d02e2a4c727f1502ec3ec3898321617b36c6df6b", + "blk.1.attn_v.weight": "d40fa49e07c102c0644e130e7909eaa93ed0d54e2edddc0759e721d58a4e4f5e", + "blk.1.ffn_down.weight": "594b1eff6ed4defbdd819fabbe2d48764984f08878a860bdb808511d5a25b8db", + "blk.1.ffn_gate.weight": "4cda97541e388a5bb607ce4cc8b3db1da7045830a630e7ba4d17807befcff346", + "blk.1.ffn_norm.weight": "66c13d7481be65b97aa474735ddc9674f33d512ddda76fa6fb45c7464b09f1ed", + "blk.1.ffn_up.weight": "1adc6de288ba4cc1237833ca8b4eb81107149842e38bc452e18e5cfe284338a2", + "blk.2.attn_k.weight": "5420423559f236ab22d85a00849f31e0cc6e9c7dd879de724393d8cd2b379153", + "blk.2.attn_norm.weight": "495fe1ab40cc52aa054ddd4f0c2d2790f4326c8d103296b1b38f3b1060db2a24", + "blk.2.attn_output.weight": "ccb83e7085381f558bfd65588c525ad2671feddcbc3887afb4038ad9c7aac348", + "blk.2.attn_q.weight": "2e8f77478392bc93c2a391f2e0f4a173a952bbab88a7aca099c6ee909726409a", + "blk.2.attn_v.weight": "d64512590f3b7ebbb9e77c2eb97fbda90b00d45c944f2b174f03a2cb11007567", + "blk.2.ffn_down.weight": "1de5084a05dcaa6b1bd926e83517dbe9ebe7fde79235fe56018b3028b1aa6397", + "blk.2.ffn_gate.weight": "cbea526b557f49aad8c976973cf367fcd12175b900f551984f498b9e07e4b7fd", + "blk.2.ffn_norm.weight": "530aa49b10c7eae08899d143409240deb95dae4e1d5bf78cea3b26393cff3ba1", + "blk.2.ffn_up.weight": "13a5fc19b96b4dcc1e9bd01998c8272ebe52034c1933ed123a506b711fae9a5c", + "blk.3.attn_k.weight": 
"1913b63a73305941d8cdc472e7f101c633d3357a78602eac0a4b49a744261075", + "blk.3.attn_norm.weight": "9c11bed5ab41f4adbfdae4ead65b525c8f19443e656a8c61ba412a4e1ad1193b", + "blk.3.attn_output.weight": "bb0b42c1d34779c5943272ed71f1dbb31ad8edd75f8bcd5c868f88505ac3a610", + "blk.3.attn_q.weight": "3461a1fe4e49f5319ea047cae98ccdb46528a3ec23831183fe87610b48c94948", + "blk.3.attn_v.weight": "82aa30be6a61526a41fb79bb28a2617416f5909f0477aa9e95e16be9370fcb38", + "blk.3.ffn_down.weight": "68521011ae03f5e3b0966127111afa8ee9f2eaeeef8d3a0b86b633e0332e9fbf", + "blk.3.ffn_gate.weight": "1e89e26338fd364bb679695968c65106382f15ad55c95cbb5ec9bdfeb766f432", + "blk.3.ffn_norm.weight": "c81932529a5a8c417c27b888dbe95fff8b447c2ea5f6f560444ec5d50b93832c", + "blk.3.ffn_up.weight": "305021735afd8669afefd713f56137248d5e817e60471a112ad06b7fa07ffe88", + "blk.4.attn_k.weight": "cc26ba5c5c28082a79e6abfe61186029e80b145252ca6a7924c437f0bcf2d51b", + "blk.4.attn_norm.weight": "302d251fdcc91f7468cf33f80b49484251d8917d7018ad264ab3a85c8ecf9ddd", + "blk.4.attn_output.weight": "a012f5bee3520cd4ce51f0076c132ebc3653309f304032ad051aa308f55f36de", + "blk.4.attn_q.weight": "3c8d607e447f5ef21e73af71e3c0d32fae16f91f31faae34ff06912cf9cb68fa", + "blk.4.attn_v.weight": "49f6c81a634ce46d71c2350206ecbd231b1732af96e4e4e67693c41a07e007d8", + "blk.4.ffn_down.weight": "e89504f311a4a34dc819a67b761022f14d71c43df3ead4f892c87aaa8e9f0adf", + "blk.4.ffn_gate.weight": "18b22f079a2fbaefe3572eec61fdcd996fd747724e2f0ff4f08cfcb43eb7bfb6", + "blk.4.ffn_norm.weight": "22415a492c168a0878912b05c854a631228b01c3ea8842e1d75989ec46c18a65", + "blk.4.ffn_up.weight": "f57379eae2874d8853f14ddf0f0fcc4ff1338574d5ed5d7e88331d5fb84f5642", + "blk.5.attn_k.weight": "d627af853c40bddf9762ce3988008c1ff17f2686fa8f73a0b5da38010147c316", + "blk.5.attn_norm.weight": "9ce01092c7f7f1c3ef72d6b794da12d77aa1f6a24fb96ba1b9bd5a0bcc3e2443", + "blk.5.attn_output.weight": "0388da8064c4b6b795ce2d8079e8a36535e82b2c9cf794e38ce8ae460aae726d", + "blk.5.attn_q.weight": "039b7ce1c909761fdf475c06cf14cabe5a90199282c89e4dcf460e95a4b6275d", + "blk.5.attn_v.weight": "c47bfd8d2496bdb6e00e03b903e15fd0ee806a515094ec257e43cc433147ab7e", + "blk.5.ffn_down.weight": "1d62e6708974bae318cbf00a8bf621d9ba0537e549ce4710a536520a8d14168e", + "blk.5.ffn_gate.weight": "8b42b1b11c92db19985094cbb50434e3a7c9cfea71ee6f21ea79eae7c49284a5", + "blk.5.ffn_norm.weight": "e0bc520f1505e687ec391d632a381d38d8ebcdec19f614a11a2000ab573e8b7b", + "blk.5.ffn_up.weight": "8cdcd17d2ea89bb9ab902dbc6bf3f827fa4ee029c6bf19eecbdefd146d8b6f2f", + "blk.6.attn_k.weight": "5dc6bcff89794d1756bf57ec665b58622d9352130d31082a6c66e1a079f99932", + "blk.6.attn_norm.weight": "13b26008abe0f119b5104b9d78ebd5e797d3cdd68122b93d73a3b4831a54d085", + "blk.6.attn_output.weight": "f5a49917ea70c3fb311ccfffbfafa63ab18416a5d55e5429b70ce8bfba57c075", + "blk.6.attn_q.weight": "d9c2f652c87dbd09ec3822e12876648fa32e86553ac25afab723b1cd9f8cef90", + "blk.6.attn_v.weight": "5ecc5fe67609a35151011cb526f45c56fc0a999079ae0ff37c755ca03c68c555", + "blk.6.ffn_down.weight": "0ec125ae0ecb2d9277fdb1b04f17efee94e37d0ae37311057c212ca2db3fe6d1", + "blk.6.ffn_gate.weight": "fa4d6d38355ee8aa3b80b476d65ae7e343c9b7770d7b097fc848ee8a6e091d1f", + "blk.6.ffn_norm.weight": "30e8f7defc627532e1739dc76d31223d45767391a431f925b63dabe334b0f392", + "blk.6.ffn_up.weight": "6b97cc32b290fa9087806b5d65aa6dc1760737730c8c71394cc4f30c2157f9ab", + "blk.7.attn_k.weight": "0231cb127cb7c3714cd72b8f39343891d7715a9bab2237ade9e7bc5f4ed2e68a", + "blk.7.attn_norm.weight": 
"7c3187f07eead7d219d98ab2daf87905e88d5f1ace109b6f5fa55dce3914981f", + "blk.7.attn_output.weight": "2f30ad972c284ae7c8eb0482053433495ebe8fe9c5ee2c28b4bc4ed1f33050fe", + "blk.7.attn_q.weight": "3a2b4b8d61cc9956d304fa9f82a9e65b4bb9fda2196670b16df7e0d8c43eff2c", + "blk.7.attn_v.weight": "d2aab97d0dcf0f61dd2f32848f7a8a99c423a4948a660a660a03a546972b8db8", + "blk.7.ffn_down.weight": "2270d520468c5549cd30023ff9c452a277058310104c4239a616373fc5a94387", + "blk.7.ffn_gate.weight": "4134a3ef71b3eac8f76b6f1a2e58625b3bae48081f175994bc3ed7d8b0d4f2d0", + "blk.7.ffn_norm.weight": "42df4abd4b8769b16f3930068f96960af1b061f1aeb7505384f272233b2badff", + "blk.7.ffn_up.weight": "c920549054ec16ff8c73a72f5d837cf4e11885e44db57c1c1c584c18fbd7a9a5", + "blk.8.attn_k.weight": "01c609bd3bf31ce65688f1f640ee413740e821330134d4ed1877a3065d1527d5", + "blk.8.attn_norm.weight": "48857411f769b00290f4e4f2e593e092781fdc2503f80c1e3eeda1b85a20f74d", + "blk.8.attn_output.weight": "90fb273f8df83744554bd59236515c16c5a5a698ca3fbedc17cc89ddcee354ff", + "blk.8.attn_q.weight": "ade617ac4653c7f00593dbb51837a468afef20a14eaab3780fb96ac3d6714369", + "blk.8.attn_v.weight": "c2c37496494864fee5c527d1fe1f88529d31c73f9cbd02ef9b2e9b23611ea50f", + "blk.8.ffn_down.weight": "2da58572e9ad79087c03cbb0c23c9ef69f93ec221fd5fe4ed92fb93871d23ffa", + "blk.8.ffn_gate.weight": "4483294e628edaa4901708e73e92c917bdd93b780fa01aa74aed57166f2bbf0a", + "blk.8.ffn_norm.weight": "c0cbb7a4f8123b62f0c4652a687f3b394802bc32870dc446eefb709e42043a7f", + "blk.8.ffn_up.weight": "9eaf8a2060cb9224cd585997cd671866c4051ad885c2c6d9fdc7056c2a5c0d89", + "blk.9.attn_k.weight": "5dd36c45fbc9c50fd35c36cd75576288506971eac5c5311d4f5c16ef60099645", + "blk.9.attn_norm.weight": "3c8ca64f2f75ed7c8fc1da010c23be787648139a96ca0ef3ad10be7b14942b8d", + "blk.9.attn_output.weight": "6277e1f833024f53c409be919ec76d34464a78b278c8f9dbf79e777746e3b995", + "blk.9.attn_q.weight": "87352b70d9e328c2d51d59090cf5ea5a046529864a890d0bc8986447a0a5c006", + "blk.9.attn_v.weight": "2efdf01161d7a82a9117cc2d87d37dba5ffefcf730781cb94fcc95130e48ff9e", + "blk.9.ffn_down.weight": "e7658a2ca984961c7ace16acb679387bedb1fef656b5330bbbf588db19673a75", + "blk.9.ffn_gate.weight": "773cd330d4ff5d64be8af00adf2e2722fae4e33fc26bb9d03549f6f4b3b0fe57", + "blk.9.ffn_norm.weight": "c8b86cd5c43b332f72060b807091c33a258e5dac01358ff4733b916cd34c9c97", + "blk.9.ffn_up.weight": "d8cc3bcff18bd46124ba2aa7caacc71220b44eeef6fccb993b4c6cb53e8f2c3a", + "blk.10.attn_k.weight": "964bdf3b4e77b915a216f750ff7b0f2eb1dd6bfa071358aef21010b90111044d", + "blk.10.attn_norm.weight": "59ed411d91d14775764eb514acb0895a75a10cbbfbc1c15d453bc50f8046cb7f", + "blk.10.attn_output.weight": "4d35a2a44cfe4ac0a83fd3ab0dcf1f5a0bf54cdb3b7be9fc353ed32c8a3eb81c", + "blk.10.attn_q.weight": "defff5339450dd881ac352f5c459293f39e07b9619ebd10ed632d79a3f310278", + "blk.10.attn_v.weight": "b9803e8d6a54acea58f662d4c0a5c8ebdf986676de7dfe12d4b288937881ce93", + "blk.10.ffn_down.weight": "eba856be64e4be20b92fb4639a783454dd92427250759df92a337e39f1971c08", + "blk.10.ffn_gate.weight": "2d5c509b066584db4de3632b01234e86edcde35409c5ebce18957dc80fe465e3", + "blk.10.ffn_norm.weight": "ecb9a8679945ff0273856624ce435dd250ffe5a440ea0861a5c84f0e4c44d2c6", + "blk.10.ffn_up.weight": "e76ec7e993f399af02958778c643aa78368e3067846714165eb5aba9d5f547f5", + "blk.11.attn_k.weight": "29c6d1f34bd3ba2f0904e57b32a5bf8dcb2834d439159a33edf234ce0b775677", + "blk.11.attn_norm.weight": "b5817b275149cd2abe18a6a10e19854605fc58fd364666744362ceee8cfe49f4", + "blk.11.attn_output.weight": 
"1e05653220e237cbe0cc770033e183c9a0eed5680510997409b16186c6691950", + "blk.11.attn_q.weight": "03db725ae669151e4d536e50285b3b047ad097f52475df208ed3e790e31a44be", + "blk.11.attn_v.weight": "27cdf1d4e971326c451a4615a0b79a8c7fe9508f9b76c0d52fa01971fc7eb403", + "blk.11.ffn_down.weight": "176938cd7c2966094f614cace8ba568b10532e45a0d438f80eccd19b6c2a7f87", + "blk.11.ffn_gate.weight": "9782339915dd6fa70013628a01524ee1d01ad8beab04068da7ac6a5ee7603a60", + "blk.11.ffn_norm.weight": "8245f6391e3be97811c0ff27f0d8f484ecc82a468a837c893f059745bfcd95eb", + "blk.11.ffn_up.weight": "15616ddde096d0d25e906375c548b6de4bd5576d1f6b68eefdc29f14e183af42", + "blk.12.attn_k.weight": "66dd21604993edd1b1fe547bcaa06f5bb7e31c9204902d147a227e4badf7feec", + "blk.12.attn_norm.weight": "23a69f85dd8a0904b9839cc5d0afcda299b74e82ae2642106224a1c820f2b761", + "blk.12.attn_output.weight": "4a98d132e376beb274a39d4ea9b6a1b870ad5c66625439d7ff6f45c229c3ca04", + "blk.12.attn_q.weight": "1c6c309d63afcfde32fe37257e300a78e25d01117e33490801107c0e75d1ea66", + "blk.12.attn_v.weight": "723d9e4ebe4e2b1974afa01d8f512b52933698fa36717dd47b37b07760c50a10", + "blk.12.ffn_down.weight": "00e0fb09e1f1fbbf3803f1dee373eaae7a93756b6e13063ab77f9927bc6f996a", + "blk.12.ffn_gate.weight": "89159f7f97aefb1e100107e3ac2d694e1008ad873f79bb953d60c2c1bb22724d", + "blk.12.ffn_norm.weight": "5f70aebd0e43a39d6373d8658cc670c13aadd7818831d3d84f761d5f688442f0", + "blk.12.ffn_up.weight": "faec21b446f061eb4dca561a3180712724347b77a71eb312e7afe9be9e89fa04", + "blk.13.attn_k.weight": "3d440825d19eac3b1753b34d94fee2b3a3cb6636c10b2703ffcf688d3c1eded3", + "blk.13.attn_norm.weight": "47b575e57e410738ad13fd3c74bb49c06b3d31030910834ece509cd1a5c6d9be", + "blk.13.attn_output.weight": "05436d8e613f4475741c1798a7c371b53d61b229507fa04fe23c504ba1f0e12a", + "blk.13.attn_q.weight": "002b5024ce520da41256e3ded5cdc60e5ae07ad9b202cb19d76ab511efd02b1b", + "blk.13.attn_v.weight": "c1f2d6763587c50312cee0d7140fa2c7ee326f5b172bc99b2d8946e08329cabd", + "blk.13.ffn_down.weight": "b5c4e0d8a3ff96cd76a135e415b89f02d28c28f7f3c16a36af31ef0ab8773da5", + "blk.13.ffn_gate.weight": "ae06e9e3d2e1f64c7ad23a4009dc904c2eccd7241f9f91c4974ab2504f116be0", + "blk.13.ffn_norm.weight": "e44a22321bcbcb4a3c345b504e939e8071370f54a8cd702fabdb40b97e0d7683", + "blk.13.ffn_up.weight": "7e6f366d538e21ad431264b12c011892d0be9dfe4c4da9f730af677f920641ba", + "blk.14.attn_k.weight": "95492d6417952ec24b2cab87bceb750fc7e95ac6b1944fc328a3852d980164be", + "blk.14.attn_norm.weight": "6b7b09e1c51addcdbb160ea59edf032531421c520ec5645fe1ff9ca4180cef54", + "blk.14.attn_output.weight": "75887474e4d72c218e6ab0f69f1bf3ec3dc414d51b36fc59df00cdb23421bb6a", + "blk.14.attn_q.weight": "940e33f76e48c21215d19e8a21234c8246d4d084381a7d9806aecb24b071d5bd", + "blk.14.attn_v.weight": "c58601cf5a9833f80f7f9a5b2656e8eab5eb133211446ebd48f8be15fed4ebb9", + "blk.14.ffn_down.weight": "f9f886e7f9b2a54d717b08947a25a0a93e8c2a5b8bcd5a907c06817c8ee3ac11", + "blk.14.ffn_gate.weight": "727ed0ee68594a3f59d704ed3240b6929f083b9c36650fb848d182315737245c", + "blk.14.ffn_norm.weight": "bd2471008ff1b2bae9aa26bea019393fb2bbc5b9493b8cec3ebd2c280fca24ca", + "blk.14.ffn_up.weight": "b006446769f51e4f93b503c4727deae897bc1fc7f4fad49f85024b63c4548d38", + "blk.15.attn_k.weight": "23bb70f9035356624039547a603e46be7d1e4403616eafc2451cc09c5373d522", + "blk.15.attn_norm.weight": "718cb371ca052eeb3bfac6ac506abb887df125271821fd171797a7f2d8dd6313", + "blk.15.attn_output.weight": "c76a2695a204b43a8e5acfa5720590b5d449a9ad9e082cbe3e80fab5903ea16a", + "blk.15.attn_q.weight": 
"2b3e4037b9e91bdd26d6e8d904cf39f948192dcf09bb6445cb55ca058d4f4626", + "blk.15.attn_v.weight": "7c15e89b6acafc8619e86aa9d412f5893ab17843ff2cfaf40eea9637b24910c6", + "blk.15.ffn_down.weight": "e16fd4bdc6d1c1209c6b633454df4992870c8cefb2cb0e8c92a7e489e9fb5d19", + "blk.15.ffn_gate.weight": "95a46bea366c260337c537fde06b4cbeaeec52484a69c3390bb1d178eb0525c9", + "blk.15.ffn_norm.weight": "37730293f704da265dc6d1896b3be00c39c0a41dab07f573af39dc30a481d623", + "blk.15.ffn_up.weight": "ba74a199da2d0875d7410824238c4ffafbda3993568812284a72b8800df91f15", + "blk.16.attn_k.weight": "f58f79a2a91c9a763adefce0c53a71eb5ce6bd8442f4af554b04b58083bff27e", + "blk.16.attn_norm.weight": "0c16e41b95e81978e0e0e3b338e2afe2d297426578cacee94de15df74e94eaad", + "blk.16.attn_output.weight": "ead22fc337514e4add49aee19720008558e52090466866e849671953a1fccba4", + "blk.16.attn_q.weight": "ef59c4e8fe8918c1add43d7e9c6fb3ef799dd3e1bdd731ec7b6a4a6f97c86048", + "blk.16.attn_v.weight": "902e6b84c2b64241470b13e6f412f859f66b4b223bcfb9c15d5cb1106b07ef3b", + "blk.16.ffn_down.weight": "2ad6e9eb4d8372c32a554395d460d17cfb02d6dbcb757cc962b6bfa36db4f5ee", + "blk.16.ffn_gate.weight": "825b2d50fcce3dbe6a5d8d8a50a95466f83ca4a10343efe67894c20b4628fb15", + "blk.16.ffn_norm.weight": "3bf6ac90befb0e17e077c8ea9454a8485a30f89f2d761ec7751b60c90aed1af9", + "blk.16.ffn_up.weight": "9fbdd08739b32411f5ab0252174d386bab19eb0b17884862f760429b7d41d78c", + "blk.17.attn_k.weight": "4033398718bf3674830ed1b73071ed8482b6dd4ef27f31a6c5fbb998321b6c07", + "blk.17.attn_norm.weight": "714f2e8ac9592966a0f1c02ee979eee8f84586405b992e8ee9543e840199ffa1", + "blk.17.attn_output.weight": "b6bbb618597d767b8f535117be68f92911e4a71d4eb4d8b5d943444151445ece", + "blk.17.attn_q.weight": "b84a0dc00ceb515faa2628125dcec502eed923077b21cfe900a4ff16c2e5f9ed", + "blk.17.attn_v.weight": "4387c7d6a17da9cc7a6bca8f4a75618b20407d570792056283a8e93b6ec65f18", + "blk.17.ffn_down.weight": "47db95c6f1e12b399c3eaf9ddba261782dd71173dd163b52af96541cf87b5196", + "blk.17.ffn_gate.weight": "59abaded0aedfd12f01df81f7a811e84db6a227f51b60abe9a247ca726e87392", + "blk.17.ffn_norm.weight": "b7e86445be5c7b722e01ddb98d5c7527ca86cb827ce0354f2c269e0f2558751e", + "blk.17.ffn_up.weight": "8e31c293bac649d2f60da4b3fc4a3acdce1111ec6058d8805eeeb242443011de", + "blk.18.attn_k.weight": "5ce762ab7b032511c131df81093b587871718c7097f79d8e07d707571f18a47b", + "blk.18.attn_norm.weight": "1f52cdc7af1f4dc1f0ef6ad1ad02e18cda32133654e57cfa9c72ada9c0b1d995", + "blk.18.attn_output.weight": "6486957f30bf8a88516e25772c6650f98b13923f490a2865a8752e36439d1cfa", + "blk.18.attn_q.weight": "93621c8abf69d2ca29c5207180eb628fb2b544d89de6c4a7fb0699be95534899", + "blk.18.attn_v.weight": "11604083b5a74828ac1d226af015ad5dc0215a1fdca44fa7131c2163c02d8156", + "blk.18.ffn_down.weight": "8f9997feb94385f106915df810239c9753b31efda2bf14bdf18a9fbbeec8233d", + "blk.18.ffn_gate.weight": "427c213b3a4e94af703429daf2f65766f70424d8230c123e7e712a18bceb5ecb", + "blk.18.ffn_norm.weight": "c45d305c4ea6a54013ba112f12dafaade064a32cf01317373464a3618d8ba44a", + "blk.18.ffn_up.weight": "a2811f2e73ac9eb9cce91a21a454e84e230a155244e2cd73f2c12aad3c9b8cfd", + "blk.19.attn_k.weight": "b2daed159925eac58c291e2f1e2000beed21002b03c9e1bc7e7a52e22240666c", + "blk.19.attn_norm.weight": "6307306ede2ab5bffa1bcac3f8b139354678c0376b1d9f5530c1fcb4268cfeb4", + "blk.19.attn_output.weight": "ebb98218b2a9c84d3fb6baeb02c5df264b7ab80d994d1098ba1cd47aa398effe", + "blk.19.attn_q.weight": "4f10df2ad09177e7528e9456039b670d07db22940a49417101b725d239c16724", + "blk.19.attn_v.weight": 
"30f1efc5114badaeaafa91fa466dc7fa14b1616db433c6f563ab851f7333a5dd", + "blk.19.ffn_down.weight": "be5ec7fe6b48855cd0015b0e430d1b70c620de87a7ff188c7c1afef546d7b6bd", + "blk.19.ffn_gate.weight": "10dffea4213881f8a9b583ee0fd370e033756d32255ed15053f794375b9400e9", + "blk.19.ffn_norm.weight": "e75cd24ade45dca78fdb0cbcaaa2d4a17d83a5a73dcc94ce0ec2d68fbdb2a881", + "blk.19.ffn_up.weight": "63e81bdb951410ffa81bcfba1b94a679ec9ebae59cd1623ce2651ed5d4c78bfd", + "blk.20.attn_k.weight": "c2fc5ad39e9bdd45e73c6e54aecc474388d944c4be1ee1921b7fcd035bad02e0", + "blk.20.attn_norm.weight": "aaa9169171937bdce20c1f057e94e9252f221cabacf1ced12e11b9586f23d308", + "blk.20.attn_output.weight": "a9f4fb496e4bc053e3f6cf2e72e22d4cd2b545ef6c32f7e782c2ef6ebcc21d4b", + "blk.20.attn_q.weight": "5a07ac619ed251494170b213921ef3fcc4c2712839da262516d9d5b8ea1ff185", + "blk.20.attn_v.weight": "d6689473105d241eacb17f09f06000ee237336916cf5ec4f48271c5b41bcb8e7", + "blk.20.ffn_down.weight": "74be38db51df736f26ede7c6b52ea787e385f181cb66231e2cced4556a25c9b8", + "blk.20.ffn_gate.weight": "ea91e06dc3d051c0ba0243b5a8bb40edbf254eadfb54fda7247e05cfdd88cbe2", + "blk.20.ffn_norm.weight": "5fbd357b3d6f44a7a91e8a4fc246b24303891b7957e0f3c32818ae5dc16ddd8d", + "blk.20.ffn_up.weight": "fe3290333e056af4ed12942ac72aeba97a6b562e2db05e79cd35dd07eab5b101", + "blk.21.attn_k.weight": "201ec6ee95f06ea5eb80fe86fd07bd016d3ae9ab6abd25d631834414e14a010e", + "blk.21.attn_norm.weight": "ea8154f93e06485828475a00b98cc397ac84768dd70e06ecc0c075b5712d7276", + "blk.21.attn_output.weight": "9f8af74d531478fd304723fd8e4e01578db598441b80dc7c960cb801dbbc501e", + "blk.21.attn_q.weight": "277de9953a8d3cff894ffd06c15ad0ee1407e319df0c1a693d4f45fa9c74ac7f", + "blk.21.attn_v.weight": "6bfdc16cfb898909b7788ddd39dd04b928f31d6732772195d53c558004638dca", + "blk.21.ffn_down.weight": "173877146cb94801157796ee9e5eecf3f46acb3b5e797f90b83a3fc22395eb30", + "blk.21.ffn_gate.weight": "53146713e2ca1be80496024077a028f6b6d749b02e71003c349e113b436f48f4", + "blk.21.ffn_norm.weight": "b28b97e18ab20a5c553ba422f7d7f6014f5902f1d62a69abd20d9fe19a5f9462", + "blk.21.ffn_up.weight": "5c39d0ac4d602b8ec8909dade93b2efcd6b6d9d84a19b252d76bb66dcfaab87c", + "blk.22.attn_k.weight": "01f26272c82917a87a3ccf922fa1d521a952b05de878241b7efe3525b617ac87", + "blk.22.attn_norm.weight": "5ffc96249d8873b506e9eb7158bdfd07fa1429e53c1951430ca7505d25f11c76", + "blk.22.attn_output.weight": "9c2201569358f720244b9c9497e4da02585a167b1414c8a506b85ad75ba990d0", + "blk.22.attn_q.weight": "906036eb4ddf027f6d920f9356a6a2a5e529b96f4e1231a0496d46b4434a5842", + "blk.22.attn_v.weight": "30ede8b0d166003a4b8a81fc99437f557719fc36e5c4dd510c9f161f36a47e73", + "blk.22.ffn_down.weight": "d04c164beabab30e1837b843e18852260efccfbb9d96a34ddd816e6fb3ba23c5", + "blk.22.ffn_gate.weight": "19c889db6b19179f0a62d5981a1506592c65de83760d67afbe00d202202750a8", + "blk.22.ffn_norm.weight": "4885eff2d851b32dbd306bd632c725857e6d164f0fa8b3d5857e572e6ef98ee9", + "blk.22.ffn_up.weight": "365594d8db8e95cf87cc33ac23947942dc326110175cc8ec5a07b5c7059089a7", + "blk.23.attn_k.weight": "badfea1569da0fc6ab817c5727ca3a69b07d9cfd622fb8be5e66678d5b3f7ae2", + "blk.23.attn_norm.weight": "8968f78a379ac3ca5458b4ed4251e8d9112aca6d6dd1ef6440b4bb0b380375a4", + "blk.23.attn_output.weight": "93e43393c03956287b1fe31e9735ff1cfe84f4ae56b83dbaebe96275e4e11831", + "blk.23.attn_q.weight": "aaff73c725a8700ae66bf26ac8869dfe96738eff23a8ff340de2ab53400a5795", + "blk.23.attn_v.weight": "3a86a8dcf14a746ed1411f5a7e634064bc4dfd6511c24cfeccfb2c9ebb6b4101", + "blk.23.ffn_down.weight": 
"d4da6f37bd7ef69bb203f7b0dd59f50bce37432c70627e6cf274ab81548af5cf", + "blk.23.ffn_gate.weight": "5b6072936c4a693923bb4e3d1473fd45545cb02fc07799aca458ef0449a04061", + "blk.23.ffn_norm.weight": "cd76e37025f84773180298ddb15e0d4ba9cfc7d832e19c791049daa47c6d9c10", + "blk.23.ffn_up.weight": "cde43b99b83124a13b2e4753d12674b3a61dfb34c04703007ced3e8e2aee1801", + "blk.24.attn_k.weight": "457379edc4cce4cbbe107385079019bc922264fdfc7bd1d1ae84343a81460c66", + "blk.24.attn_norm.weight": "0ce0dfab2edeede5da419fa7833db78e36222cf25c358d08f3ec664310f031fb", + "blk.24.attn_output.weight": "0cf91c2fd40c204d2fd4b9c85b69281e5ad4ea8442972fcd44b5fc8e835ffdf8", + "blk.24.attn_q.weight": "87ede30c09eafec6a4e6285674c1bc4637140b168b2da4ed34f36fdb6e176cc9", + "blk.24.attn_v.weight": "4c0b078b2798ca35d6d2c2258fe499820d2bc88700654ba4016e4b028f563590", + "blk.24.ffn_down.weight": "cdb8540c32b1ab988f984484928d39f6841f2131c1cebe90ad9456737fccbcaf", + "blk.24.ffn_gate.weight": "da2e0e913648b5526bd2bbb344038dd067639343aed3b413662b064b0db7556e", + "blk.24.ffn_norm.weight": "8940bd781c610d75eb2be63cfc8d869a3af05e53c963dc7fd4c6f653df5a80ab", + "blk.24.ffn_up.weight": "90cbac2a58801abe11ed6c24560aa4acb949f79429f2aa8ff129ac05868bb87d", + "blk.25.attn_k.weight": "90607131e36998e990ce718ad05cbecd1bcaed010931401ce6baa3b0d93ebce6", + "blk.25.attn_norm.weight": "fbf679c85656c04a6cf8fedd5412c1ace22960e6c2d47f2d43997827811fbb97", + "blk.25.attn_output.weight": "08412724ee7a2086514406e6f68fb9f622e10bac25b0c373b294709f4b09bd2b", + "blk.25.attn_q.weight": "9c1238e98a2747654a0d4371d3e7ea8b979867f609dc42482544f25591e85c7f", + "blk.25.attn_v.weight": "a57796a535c6cb09581cbafd6a91dc14adc8cca2a2465a7ffd0aec546cd84074", + "blk.25.ffn_down.weight": "f7e34e8a6391b480da08b52640613ccadce268373934b409759743a1735b74d6", + "blk.25.ffn_gate.weight": "b8d0b2f4612678b5ce42bd4a683f8024514b75fb5ebf6b22c600811e95582ee4", + "blk.25.ffn_norm.weight": "cde1fdba2369d315f3c6940a997c471ec891924e642505db580d732763bd7b75", + "blk.25.ffn_up.weight": "72e700c32ac8b9c47559c2222e45888a480b527ea512075423c5dc01678e2bb3", + "blk.26.attn_k.weight": "6ac83b3414ae75bf3a9055c32e49d2c40fe611ab21f8444f03d2f465d18122c9", + "blk.26.attn_norm.weight": "55f9d6dc9d75973dc75136ecb9d991b4398097ac133070873fb96ec76a6f60bc", + "blk.26.attn_output.weight": "ebc4fcbd15b33263e50ed2ad45740867cce15bc90e1216623babcb1820734509", + "blk.26.attn_q.weight": "080f057521073e412936fe3fee64fd574c8128fa4a148b879d3e598fe4954581", + "blk.26.attn_v.weight": "0fa2830d6746487ac91b243716e4302361f891e4e008eddd14abec47c7809d5e", + "blk.26.ffn_down.weight": "cb2ab8af1653adc57111ada49d2825c6995e338c8208455b92de10e580f60f31", + "blk.26.ffn_gate.weight": "231ce30966086bce2dc0e0afd34a22a1958cfda7a57c41b3b8e9444c5dfde8a6", + "blk.26.ffn_norm.weight": "35d959d25d17b00617590f5d5831bf705c385c51e46297a14375a700effca6af", + "blk.26.ffn_up.weight": "367680c8d332538b467d1ef87cfeb36cc5c6af564c5023c5fb50e728e3438287", + "blk.27.attn_k.weight": "0bfcb351c6d17aeac5b55a915074fbdf00f11c4bda98babb196ac8804805746b", + "blk.27.attn_norm.weight": "5d598a88c2e75ba59dd7ba4fee940bdec92d72038f1286536d2dfb71d008a09c", + "blk.27.attn_output.weight": "23a9da7347336479f6a10ded14cb3f46e06b5bd56dc4b0fbc526c688552ec840", + "blk.27.attn_q.weight": "b83319dba9055f069208e9c9d66da08bc6874f23e575288fcd81697d1777aa54", + "blk.27.attn_v.weight": "36ed34ccb2f36fdf16b2c2dd225a98ea6b7b0e376e7791191136ccd7bd7a4add", + "blk.27.ffn_down.weight": "5488e1d3a58c71b5e9ddda430540b4776b268cfe1457cbc1c2622dedd9e4526e", + "blk.27.ffn_gate.weight": 
"4ff48011ee0bac39af704849d9132a2410392c87a509c684f2062f6b76b498fb", + "blk.27.ffn_norm.weight": "32afe99675983da3de2961d1b5ca41c98970a356823597fe29e91f6e86abf0e8", + "blk.27.ffn_up.weight": "1eae3088a75629571fdbf6a20f141bc2bb2ed3f5ba2b9fd1d949f80695e442a1", + "blk.28.attn_k.weight": "c4e80af714962d6f9040d2c09f316f4a1cbc3a2e994e19902d7c653cf3c73dba", + "blk.28.attn_norm.weight": "c1ecf85dedc1c83d5d402bb7c94fb8b9c11f1a3e5f64e7680f80912d4a560794", + "blk.28.attn_output.weight": "72ba47c061b21f5ebc5213a455eaf6fc49c8f8e04ff9ce37e6ed4921b629161d", + "blk.28.attn_q.weight": "c4abc47234307f44b8ca789aa6668e298158fa4b459b2c1e84bd581806591cc1", + "blk.28.attn_v.weight": "aeba950799d4950e491ad0fcbe30334e39b8975177990a2cb339031c45ac153c", + "blk.28.ffn_down.weight": "4e84ce382a37b994fb8608df451a60040559e3f4f3241c3b3cb8989a3ed50d83", + "blk.28.ffn_gate.weight": "04df157acdc8e8534ad60acc2d2a4dd3a7a6610f6382535ec728994fa6f83f83", + "blk.28.ffn_norm.weight": "4d0386dae2bd1c1a9d0f9730718333e3a486c3bc6a5c5d482193c75d39832c80", + "blk.28.ffn_up.weight": "fec60bb0a3daf182a14bd8311fe6dd1e3fd020c5fc273e2549cdb1a2d6b79b05", + "blk.29.attn_k.weight": "b0532a263aa5a4e2a7a80adc83fc5dec974493bd18da7f953e7ebfc3f3a19aae", + "blk.29.attn_norm.weight": "593fc3b4000c35b7a59dace09ca1756c08be0105b2edd354a0e1c16c82898859", + "blk.29.attn_output.weight": "315b896f9f0cbacd0ca8937384c3a3a227efa908cb8c3a9125ec00c480e32b9b", + "blk.29.attn_q.weight": "d482d45386d4ad3394f08e9dff233ee3a70d0427d65c0b8fa05905da7e25ca53", + "blk.29.attn_v.weight": "cd3b5a6e2852da796902930a6a84bc87fc6a7c7bf51f8fc23758d12a39013b36", + "blk.29.ffn_down.weight": "5b3dba6f9753bd1b1ebcba65ef5373dd62c38e755c44b7231b95d93d45761f89", + "blk.29.ffn_gate.weight": "8610d9d2db15c256243ffcca3ffd31786d0ada0af0e7c7aa3fd20524370ab036", + "blk.29.ffn_norm.weight": "1a2ef2d38b7ac3e51190b9ccb8b6552ba83ab290e523356a7f851ddb35dedca2", + "blk.29.ffn_up.weight": "a5fdd15811bde16dc27677cf1a4c97daab4c28cb12a9530f1a0e573134fdb69c", + "blk.30.attn_k.weight": "1efeb0b5f4b45a85cdf47300f892ac77ac1f38000ec3653565d1303d1fb8c743", + "blk.30.attn_norm.weight": "c73934c182c7fe80838ec1d0b92f50a583f75f7a3d78d822f009b58ad2c80e65", + "blk.30.attn_output.weight": "3a0fd89de2d274614750345d827a9c886a4f97b343a13cdf680390505df596a3", + "blk.30.attn_q.weight": "711e113362bdb067db843c66236704eb1cd3fc5f40e3767143e96d510686ef4e", + "blk.30.attn_v.weight": "82b12a9a74fd3d91b73cc2e841e2b3f0a5197ccd2998afa17020995f880d2267", + "blk.30.ffn_down.weight": "af9f4b1287c0d824ae22d6e335d19e04a70135b835be7caa2435f1d85e931993", + "blk.30.ffn_gate.weight": "e2ab3e6f15f5c50fca66c084cb6a57a2b6b82406d65150e82ea0437b93dd9a46", + "blk.30.ffn_norm.weight": "c1b9c325c83f00e177386a4d7e769945f2995e60950c4a576c0a2c4ab9703d04", + "blk.30.ffn_up.weight": "9b94a21efd419715d82071b490d3b635cf1e8da080620dcc39e5bde976d7e9a6", + "blk.31.attn_k.weight": "0db0d82e3ddcc2c06209f5f013e1d72a84a996c40bf00186be485b909cc268e8", + "blk.31.attn_norm.weight": "2b8b7239471f57140c5cdfe06bd224a4f6326282f99736e44fba4c7b120ac101", + "blk.31.attn_output.weight": "a310b048840cc3ff2be4b84796340e8e2cdf05ec89d14bd3655c109b2bfa9fcd", + "blk.31.attn_q.weight": "f45e0cd95645175ea82813455356d171838539bc3f7676d877c698f2af0a0eda", + "blk.31.attn_v.weight": "8bde008e809112aa7e7c23e9c3099087bcc557313b01306c87efa0a4a30805ba", + "blk.31.ffn_down.weight": "8266fec7e203fbfad7033120861e44984581ff8b6851d01dfb7b81c5d8fa90ec", + "blk.31.ffn_gate.weight": "b73bc0aa5baf006d9ef6403104891b8133671b0992398fe038380b67e0d7e2cf", + "blk.31.ffn_norm.weight": 
"9c62cc27a7b6017c1df8ad49bff249a8245e8895c6754f402cd44623fda83268", + "blk.31.ffn_up.weight": "5b970a4694ea3171a0167f6e1636d9f00268bc1c9640430ffc35218494884adb", + "output.weight": "74fa0ef08c57a30e633e7117b1e9c805f833e2e5e21434bc79ddf9c92c6d7330", + "output_norm.weight": "59b8a59fd3fbf39353506116e43e5e76edd0cbf2a2873d869da4cf27a04997c3" +} diff --git a/convert/testdata/Mixtral-8x7B-Instruct-v0.1.json b/convert/testdata/Mixtral-8x7B-Instruct-v0.1.json new file mode 100644 index 00000000..0967ef42 --- /dev/null +++ b/convert/testdata/Mixtral-8x7B-Instruct-v0.1.json @@ -0,0 +1 @@ +{} diff --git a/convert/testdata/gemma-2b-it.json b/convert/testdata/gemma-2b-it.json new file mode 100644 index 00000000..0482f1e1 --- /dev/null +++ b/convert/testdata/gemma-2b-it.json @@ -0,0 +1,188 @@ +{ + "general.architecture": "gemma", + "general.file_type": "1", + "general.quantization_version": "2", + "gemma.block_count": "18", + "gemma.context_length": "8192", + "gemma.embedding_length": "2048", + "gemma.feed_forward_length": "16384", + "gemma.attention.head_count": "8", + "gemma.attention.head_count_kv": "1", + "gemma.attention.key_length": "256", + "gemma.attention.value_length": "256", + "gemma.attention.layer_norm_rms_epsilon": "1e-06", + "tokenizer.ggml.model": "llama", + "tokenizer.ggml.add_bos_token": "true", + "tokenizer.ggml.add_eos_token": "false", + "tokenizer.ggml.bos_token_id": "2", + "tokenizer.ggml.eos_token_id": "1", + "tokenizer.ggml.padding_token_id": "0", + "tokenizer.ggml.unknown_token_id": "3", + "tokenizer.ggml.scores": "0872465d173867d755d3ee728f882b9dc2057a0bfd596fe1e3d131522f1250d8", + "tokenizer.ggml.token_type": "485e40bf3d715a4764818fc097d6a2a41db872d82ee714bc500872a3437ff48d", + "tokenizer.ggml.tokens": "c6e66de1841f04de8b8d236d461ab720a4c9b9b5414dc293a09c6e10eab45fda", + "token_embd.weight": "17b87ab2c01c80657855a5413d0457b4a041afaeda0cc785080e44e2f04acf07", + "blk.0.attn_k.weight": "28ac0da05754ad2714ae95da28a5ad191192140b30b8fd22d108d4700c9d989f", + "blk.0.attn_norm.weight": "3f9d5675d1ab0eb8a816719dac9fab81f2e95c52be02c34263339acbc087febb", + "blk.0.attn_output.weight": "703295c2c63990ff896778685c678f145298886f680f3ed5dc2a7ad54c293265", + "blk.0.attn_q.weight": "69c2d0e4870e9d722a190d356203c9605575a16863466c3d1747966ef1cf5791", + "blk.0.attn_v.weight": "95219c9c07b5ffe9a9a01e456d845eef2b11f4fc12c93dbbba479db395444c13", + "blk.0.ffn_down.weight": "a2feb5eb3d572c57c5bafbf0ab506862df1160fe40965dcfe4b9fd855c08bed7", + "blk.0.ffn_gate.weight": "fcca072c445c31f4dc4d5dfaa785b1bdf7271342442099b74fd17268b5829fbf", + "blk.0.ffn_norm.weight": "7621f95dbd245cade6fffd6b08797d69d8e3954e960f0b5551b90d967ab95448", + "blk.0.ffn_up.weight": "14a9bcdd451403c67136391e1b6e53b3b1830f00199bd911dbcc56d8749c14f4", + "blk.1.attn_k.weight": "c70f73c5df20579cb44d971164b48b5f0d8d5abdb38b381e7a8b880ba12aa406", + "blk.1.attn_norm.weight": "88b6b91f93a1ef83425a7c7dc2a2fbd3b22704a04c64a80061df376ac8c33626", + "blk.1.attn_output.weight": "f031a537490c452be3b3bb51e6b7949a636405756e160976a1c070a792ea00ee", + "blk.1.attn_q.weight": "bdb23214b1cf9cfd30f863a0a5868e52c6809d93b7e8f44df096a94204d9896a", + "blk.1.attn_v.weight": "e9bbc0b05f2c872fb1403f8f938cd1612b502229ee401f12593b1164c61acc00", + "blk.1.ffn_down.weight": "5ff53811038b661a7b8f2bfdf213bebfb185ec1a6060b662f063714f33584d79", + "blk.1.ffn_gate.weight": "205085c8c951a5c7543b1495183cd96028fb49f67464b3e9862a2693a6077a33", + "blk.1.ffn_norm.weight": "798f354fc85afce9625f5d10093a585a966831698a0560e6c9b97ce659eb4b22", + "blk.1.ffn_up.weight": 
"db92dc5684cb6e90940e13f4d1da555ed20ba4f8cab1e990ddfd7553e2e91315", + "blk.2.attn_k.weight": "ef5ce360c4eed6d00d03ca4761e0f8e4b0af4509978468314be14f3d46621044", + "blk.2.attn_norm.weight": "6dadbc05dbd0d3fabb4216affa60a3de1378a82d2859dc90b338cbe70f50d455", + "blk.2.attn_output.weight": "6bbf87a966f691bbfd7c8d25629aa4e6710107bd431a667434861febb391edc5", + "blk.2.attn_q.weight": "4e575c09ae2de417ce9057ce8b073680e860a24aae13a472b68f101b760752e5", + "blk.2.attn_v.weight": "cd33f7f01141e9439afdaf2ea1aaced9feaa335e32a58daa136ebd555d4d96f4", + "blk.2.ffn_down.weight": "b970ff1b0b6494165defe2fbfa1d31425766ed71e64de9ec4e66ac3955c8bc5f", + "blk.2.ffn_gate.weight": "dbb3e1360402e0e369b101995bb686b73f95d4a7673f061be85d64d15dfb0061", + "blk.2.ffn_norm.weight": "bfb7980105d8ac9647710454f57a5cdac50598a0f6f4884e16f1d94b00844687", + "blk.2.ffn_up.weight": "50ef89339b275a438b664686f6227dd9b6e43853ed6856ec9e33ef4bbd90bda1", + "blk.3.attn_k.weight": "be942ea98151434eebcd2c1da4b00e0146152fe524a530689b1fd491cb833d21", + "blk.3.attn_norm.weight": "0df2f218daf609c289fb7c60c5f375fa99c0d4e04381ad5a494a19144edd8e20", + "blk.3.attn_output.weight": "c2184aaf86aa2cb8f47be49f60b165834e97205f39c6ee1dfd19fd4411a156ce", + "blk.3.attn_q.weight": "4f86e2a0a4221c1c84ff9c409ac89893cb95d7208cf65bf1e98e24e01125f991", + "blk.3.attn_v.weight": "abfdb8a60c349dadde641d1afc9542025e24fbf41a3238bfa9675e0b1f1e4b68", + "blk.3.ffn_down.weight": "58821a8d87008d47d122427911c6fad5272aca70c448bbae223256a74bacd07e", + "blk.3.ffn_gate.weight": "776e051f1a0ddd5c4934e69186683a75ca9a3c8c0f61911bba321fed1dd287d2", + "blk.3.ffn_norm.weight": "7f380f29335e28be90bfcfae6f6d69fdf5751211b36d2dd62aa5541ed113e4f2", + "blk.3.ffn_up.weight": "fc5ae8d488894cbd4951059675468d227da27871d26e925c9941863841c097ee", + "blk.4.attn_k.weight": "14833b078cc4c5137bdd5fdc0538047974ca147a99b0282e1b144440c78bc1db", + "blk.4.attn_norm.weight": "0a69957d4a15599fb80ad4753558020804925221457d9a5052926754d3768065", + "blk.4.attn_output.weight": "887a49b6130fb6297cf10767207c3dd97191b2cf63723449af9c27bca8dbeda0", + "blk.4.attn_q.weight": "51fd577b76764824dd6f0d4891c137ebe4736f591b5ca2793c5fff2be49abbde", + "blk.4.attn_v.weight": "1a623c43cf9c509d1b7ea0d1a5c04d0af4809665f9f9e93b7d6dba8c5df178fa", + "blk.4.ffn_down.weight": "5d61e8856d8941d2b1fd138116d015f63840d0fa1e31e20e20a5ceca1536ceec", + "blk.4.ffn_gate.weight": "06640f7273764f8ca5df7e386547417916b6cd7d565a8343153113239a94b0a1", + "blk.4.ffn_norm.weight": "91a6c6c41b894228e361435ecbc5058dca34d4911a23da5b56de219299c964d3", + "blk.4.ffn_up.weight": "d016dac1055e36d6a10b6317e57f98a904709ea892ef3194342f4d2f6326561e", + "blk.5.attn_k.weight": "987146afe124131500808cc0da33c06d207433656d41df6e6d8c99118a83bac5", + "blk.5.attn_norm.weight": "6b354938966f2608a2fb8d0f5b363ed0d8b0967c2ec8d0abd5c625b413042ded", + "blk.5.attn_output.weight": "cdcbfe02c6ff79d5326882b017a02099f5af71beedf6b1b3eb4de01e3a844536", + "blk.5.attn_q.weight": "b910d0cff781d3efb42eab0a302f46f286b2de717079175680d5b42bf8c309c8", + "blk.5.attn_v.weight": "66d3a279f747412f9f4b0e8abad44540c122ab2e811a7ee74c1f33bc36caade9", + "blk.5.ffn_down.weight": "c9b0efd2212981f16d956d8571f054b68780ad01f4917033647e359b557a4653", + "blk.5.ffn_gate.weight": "fe96b94109ca141c01f6a04788e20783019ca6ec334aa1f3134810bdb499e557", + "blk.5.ffn_norm.weight": "aa7b016e832e7055a36c6e20de58ea1936f995f390401fff1c5fc65906064e49", + "blk.5.ffn_up.weight": "555ce27c4873d3375394f38ad3b45e3d8848f9d5642dc1602383d0f0a33c2a14", + "blk.6.attn_k.weight": 
"88280d461db324c4f36475ce396793063e61a27283ec64511b0480890fb5b3b4", + "blk.6.attn_norm.weight": "af8f460c411f660d33196286d208f1845fd5a2b45f7b56549a4df31e7515447a", + "blk.6.attn_output.weight": "dd9996fb0a256e8375ad3917705258a33fce006bcea0f536caae420a77974d8b", + "blk.6.attn_q.weight": "7a4841541191e037cfb9b07930c4d8cab451809658b182f0ada6ccde9615c003", + "blk.6.attn_v.weight": "ae81e6a592b64d701a9d40233e986039a56cba8d8d24f61aea93c6393cf3078a", + "blk.6.ffn_down.weight": "622dd1ce1706355cbc659a8ab2c4509678ffe0f3ad34258e5e25ed2a5d951bcd", + "blk.6.ffn_gate.weight": "8389a735c0bd5591010f8ced9805a2a12c749f6df0d3c18ad4d05c2a302e7168", + "blk.6.ffn_norm.weight": "621f5346400382474d61358397bd58fb1459b07c53e376e4bca15e08b3f9b3fb", + "blk.6.ffn_up.weight": "8d834e4c42f13c251dfee36cf89e12f1bd400680d00d5c2e6cac0459e9ce2f7f", + "blk.7.attn_k.weight": "8bd0412de65a3e64901ef8fe6a28c95e116bf39dc9aa22f0126b9d36688e5ea7", + "blk.7.attn_norm.weight": "056d8e56be4e87d6dc6f900762f0dc6fde07bfdc50dd85bfc510415e2bba3f3d", + "blk.7.attn_output.weight": "27972eda51da53d416ff95aed78149a2c5a287b47d2cd46f2f544ca692ecb3bb", + "blk.7.attn_q.weight": "41eca977b9371f7932800c11a9c45b931310196919e2a0651b847703b180fc7f", + "blk.7.attn_v.weight": "13c74fd7e07f08883a09fb070a1fe5bbdd2341b4cb8d1cac07c4b637049b5774", + "blk.7.ffn_down.weight": "9e75db42468800849a9a7da603d0072c5e86c8ed2b4d8b20a312a51fb86a7a10", + "blk.7.ffn_gate.weight": "db6bdc3117f910088aaf7db51f2da63ea5bd933de36af5599c215bfb26f7db2b", + "blk.7.ffn_norm.weight": "48bb82b49bfc8679a1e77f282ee182d952db7a3c11be7ef9a102ee2ddd8011e2", + "blk.7.ffn_up.weight": "feebea87175817a0f3585ec0af09dc873d94c203581ae97a712eb356d3b49efe", + "blk.8.attn_k.weight": "d5640ad71b6af68d88e17bf8e7fc26c907d2262605457a84247dd9afc2884d69", + "blk.8.attn_norm.weight": "75b850c481a69083ae09d0207ba7317b37c735a39fcf5fef5400e6c84fb1257f", + "blk.8.attn_output.weight": "cbd669dbdea2bdd90f9f0cc97566b3dffff3c56cecb4f47290ceef30da83b2d6", + "blk.8.attn_q.weight": "9edcb63087a431bac361822497e6ecdaa06d9ea4a1a754e36da7ba9f8db81c7c", + "blk.8.attn_v.weight": "3fb72c2c4f95a83626aa3e30062f9450b09ab37c7871e229f18bbc5cf744633c", + "blk.8.ffn_down.weight": "bd69d2c9172974fff154441b237b4787fb53b2d185325442d5048130ef5bc4ef", + "blk.8.ffn_gate.weight": "d04689c80553edd011d1cbaa5d570fffa7fa91e88b66cf1352d89ab60b72f908", + "blk.8.ffn_norm.weight": "e49984183b735b7f2c4e4730c289eed9394056d2e283a00fd83ea0915df31a73", + "blk.8.ffn_up.weight": "8fe62a1ce8e847e567add6c6f6bf2922bc467495b5eb4c116b3cb85b85b3b211", + "blk.9.attn_k.weight": "d90904959e5004cf0d6e729c6bff18cc33c094798b802473c1ec55ab8d276183", + "blk.9.attn_norm.weight": "79277f290cc07411115d8fa138045edf4a17b3416ab2145409cbe8ab829fd4ee", + "blk.9.attn_output.weight": "5a21bf2e1f09a81405025f96d4153ffb630158e17269cff8ffff935c38ceb1a7", + "blk.9.attn_q.weight": "51b1d0febc3b350945be4504f55afa4347517bde0f710e1a4b88e6b17e71e7c7", + "blk.9.attn_v.weight": "aab7e1db0a8b50a03036356791ffce736ab010d15674c96eaef8049d80076054", + "blk.9.ffn_down.weight": "cbf43ec84becb40c9359a181ab0e641fd7faae7d34b549501f7cfb7afdc3d764", + "blk.9.ffn_gate.weight": "dce0e8661c778327bed7f03b6790d26710764188aed9dc746e6e05863891fa57", + "blk.9.ffn_norm.weight": "6d41642104f995c77bf31122b13237caebda3e7fcccb1367ce91db36b015e923", + "blk.9.ffn_up.weight": "82fe4c67bf24e7b2d6f6e05f7b1234c2bf90c3932951091a9066211b8e15ecbb", + "blk.10.attn_k.weight": "f6a9ed8fd8d3229b5d03175c413ffc56a07f2ce7236271986361dd3d8993f9aa", + "blk.10.attn_norm.weight": 
"cebbef89f0326ca8e02df3867a571e4d61c20c2a12f295f98ae590d62bc86010", + "blk.10.attn_output.weight": "34f5efb86accb4f06347d83a32558ea8eab3039d128969161a741ebacbb656ff", + "blk.10.attn_q.weight": "1e0efe27df2d5d50f7157253ba2cfd436d6781c3dc78ca176d0c16a210b5b763", + "blk.10.attn_v.weight": "8f085bf50a2b0f83cd6cdda3c8ef5a9e204a36348ed95871aac725d1f68640cf", + "blk.10.ffn_down.weight": "bf3b3cb4cace435809ac7b4cc933f20853af12f1f272d3dcefe7f19c0f203b8b", + "blk.10.ffn_gate.weight": "d3df7a1413b1c5adf1a1dcda9e5225a15c89874bae53bb6137ad1ea42fca2d34", + "blk.10.ffn_norm.weight": "a1da603b0480471b5ed8e862148cecd5fed918f8304d6933ab0bdb25b8d2fb8f", + "blk.10.ffn_up.weight": "bffbba605922e972dc47dda88a0b4659aa52236c76e5fe861a949e6d9a367492", + "blk.11.attn_k.weight": "9f31c63d66cd32c29b1eb8bb829d0c8525ce2ae936e0eefdaab6335a2d12a3df", + "blk.11.attn_norm.weight": "0bde1a266d8b2e8f202bb7e2e88b19147ca83021901f6d3cae77a4df5548c754", + "blk.11.attn_output.weight": "e10725c7cf746ed4a7e472cf7aea6cb564e5db6a1d5197adc980d650a387ccea", + "blk.11.attn_q.weight": "05ee758a7d065802630f8c65dca424364c1c8825e389aa33f9405c45e8a50cce", + "blk.11.attn_v.weight": "0c3ae7090f11775d24c51120db6e305db6aff706493e7ee123dcab74485ba789", + "blk.11.ffn_down.weight": "7ba40b8e12c09c5fb2006b77a771cb01ce894e88a3b3e1877f927a5b89c91709", + "blk.11.ffn_gate.weight": "db76388a023b98097972d354ba1c6a5e26efdeb1c596b9c28bf2cd8f6596975e", + "blk.11.ffn_norm.weight": "a38c3ae1b89a68ddc7b72c99c5b28be7fe3787c4fad9904d0c43d64eaf00c474", + "blk.11.ffn_up.weight": "13c8142f9cf1eddc658babf978daf3515c4ccc45f849f3e7e3930aa18a8480a0", + "blk.12.attn_k.weight": "f03241c36ac87cb57429a2ef22186b8d7d0b590a8b173beb01fa13d93772f3b1", + "blk.12.attn_norm.weight": "4568f654e6d65104d586e7c16ba960c83428698ce103022b7e0be15e2884e13b", + "blk.12.attn_output.weight": "04867603f82f91e41306e09b33ecda0104b3ee4834061f2c0bbdc8da33c72509", + "blk.12.attn_q.weight": "70fe04b9a8e08b6100cc8d6b58bf4cbbad15ca1de82d63baca5d352ba6c4cbae", + "blk.12.attn_v.weight": "15cb28db61a86c98687991d7e611bc92a1fcc6007f3432149cfb5fe518a4f65e", + "blk.12.ffn_down.weight": "6d10c790a4e3dc44c2dc36d96251ae97cdf30a4fa04d4c43e31bfbd038e6a7b7", + "blk.12.ffn_gate.weight": "3462a2d8f6b4743b25e24da51b90018ac2858d05ac7e582bcb69063cfdac1104", + "blk.12.ffn_norm.weight": "1f96392c1faa34e34ae5dea55a6a86c5aa4c79758952075d53d28de89dd88456", + "blk.12.ffn_up.weight": "d22eacc612a7411953d948483c5fb201e11722955ee0754da866e7bec578ac6d", + "blk.13.attn_k.weight": "5864977e6b733ea942647d6feed5c76156c48c200649c22e4e11b9e5860e57f3", + "blk.13.attn_norm.weight": "87e053535144723db4145aa5402acc54331b7696752d852bb9fc542ff33f0fb5", + "blk.13.attn_output.weight": "078145f5ad83f8b14f97a869346f7fd1583b24d1e3edadaa95d3da4242973f8f", + "blk.13.attn_q.weight": "3b8caf35504cbc4d1a7dd6e011a95760703b7f71e2218b030b1254f811362dd7", + "blk.13.attn_v.weight": "4fdf8365a603e043e5b40c4a21c84ac167f9be62794178f9d8a608dfe5653bf9", + "blk.13.ffn_down.weight": "a07d3abbfcacf48ba028df2cab895be32cc15022d23389a745286e79c1b1d1fd", + "blk.13.ffn_gate.weight": "1d2ab39666aa2909acc96787432a3ed13b19d25170f74665fadff9b17bbaffb1", + "blk.13.ffn_norm.weight": "4f2e809fda5f3eadf52578ee50e0ba36e53be91e55dce418c12dfe595f5f18e7", + "blk.13.ffn_up.weight": "8783d2720c2c37ca176a5801e0b3ef1f9cc9cf3ef1cd37af423aaf6b2a27e2bd", + "blk.14.attn_k.weight": "ce9428e2b55d43ae0c6690dbd56182f99adc427694ba8236b405cc8ea5035e86", + "blk.14.attn_norm.weight": "6abb35f9db8251d6ae954bda147c6ada2371b0574d11702e828f3c6ac99b7cc0", + "blk.14.attn_output.weight": 
"fe3880916d0ceb5bff672c88bbefb7060a545be609bf049beb2024b38221836d", + "blk.14.attn_q.weight": "7c8ad81be6f4a350931fd108b5f7c9e366e8c26ef62d1d85ffef5dca8fd893f8", + "blk.14.attn_v.weight": "e4bdedffacbebe38567a0734dfd67db90e911d9a9669fcde9a7c4ad8a0066c52", + "blk.14.ffn_down.weight": "ef6694dff1e05820aac0cd2b22f39ac7788b4967afc9250775575554c66aab2c", + "blk.14.ffn_gate.weight": "db63c4179e2db704bc505e2b4696e055b593e295a1b7c4c586fc793bdd5aab19", + "blk.14.ffn_norm.weight": "2796a62d832a9710148f95d533320492a33e712b2e5218659c548705bd11684d", + "blk.14.ffn_up.weight": "3f78c78d8c2d54df45f799d4ff902316628af296834afe4ceed63d4a324ff03e", + "blk.15.attn_k.weight": "6e810ee3859e07695645ee0c9a5efc7962668984a5f0a9325f47e462743b447c", + "blk.15.attn_norm.weight": "0956b576ae96db0b28cb09f761f801cfd9281432284664f0fe181c8d9c55d1ec", + "blk.15.attn_output.weight": "03a17f7e94208177aace5cc41b7f54670ba57873b7274ff6e23caf58cce110ca", + "blk.15.attn_q.weight": "b8edafe7d2216a6f8b4ae4905a906475490e6ea418f6e1d3cec563dbdc6fab91", + "blk.15.attn_v.weight": "f8ae8cae0f4cfa34a459824eba57350c3c248104ba5607e7d9dc7d7c39aaf4a6", + "blk.15.ffn_down.weight": "8d02eb439da852246d2ca67e9b7b6de0b090b80744355e64728a23e41926505b", + "blk.15.ffn_gate.weight": "ed5bf361c67db8731f186b775826f21c33bdb521111fd2d922539719a770239f", + "blk.15.ffn_norm.weight": "5942ca3c73209ac9a0c8bfd9b4aab7f7be7aee9aa12d9c35833493b44af76767", + "blk.15.ffn_up.weight": "f4bebf4ad99ec5f911327dec347be6c595814885309c7bc5647ce28c7f4d1cf5", + "blk.16.attn_k.weight": "756a534c19364448e0958b8948fe33891c6ccda0fbb4dfa2024e1f532a87804b", + "blk.16.attn_norm.weight": "386b7b9e4e6509f6af9c022d942b6c6c6cc136aeed8751ecb037c74d7c4bfb93", + "blk.16.attn_output.weight": "3ba1a766a25830b84d7c22178203635f9c5624caad290bc5e5d73da5d5e7a2ec", + "blk.16.attn_q.weight": "d39b0c91e1fda7685d50a0f7cc8d18c44b5bdc90a142c7fda0bc329cca1afa74", + "blk.16.attn_v.weight": "98b33fcb0ee3483cff1b06ecb44d7b7ffb4d34c268248e4d73dfdf82b2065b2f", + "blk.16.ffn_down.weight": "14006f5e4acb2f9416271ae562e299359cd2585739c7fc77ccbca54495563948", + "blk.16.ffn_gate.weight": "12f8abae2d301d8f88bedb6af98b1daecc7b0b8d05148594f931f30958d77aca", + "blk.16.ffn_norm.weight": "129a15a046ee96d06de288bd43c80f77a6b0fb3a159c7367154c6e4aaf362672", + "blk.16.ffn_up.weight": "b4a5911a45f3871ef1d4efb7dc7108645a564b70f818eccf45beebef2e844ee9", + "blk.17.attn_k.weight": "5e1bfcff0146ebdde3817b656952892eb671e14e75afc92fa53f84f8eecbec4c", + "blk.17.attn_norm.weight": "60bc988fab7c4b29ee9de599df41a8de00caa94fcd74677da011fac82f60f465", + "blk.17.attn_output.weight": "ba49b40d6a0b5685f749c24b0edbed3adc44dbe13b5d5e5fa1e56169fc746555", + "blk.17.attn_q.weight": "82bb415d24efcd14d03ace03f907bb70db6a204c76a0bdd1892e0fba165db87d", + "blk.17.attn_v.weight": "73dbe54beb91a899884e275ea81ffc5187a20cb7d5b68d5c299b783096999d94", + "blk.17.ffn_down.weight": "7c086166241e0664f8963fd1ca4ed74c737abfb2525ec20f8435821ff50158f3", + "blk.17.ffn_gate.weight": "51a32f78244d42a539f619c5ce661db9e6cf41636280a826d439b5444edcd28c", + "blk.17.ffn_norm.weight": "c4bb247fccd1ecc84875028af63dd20aaf5cbd17eb94a9bc36679c09285dccab", + "blk.17.ffn_up.weight": "b5886182790bc6fbadd63de9bc4ffee416f3b69a66280d197ab8c18edf769abf", + "output_norm.weight": "481f3097d0a20412e35b3a739b1b958487bcd41ff67744baa3c9acbddd2ee4d4" +} diff --git a/llm/ggla.go b/llm/ggla.go index 34c4f6ca..831f6071 100644 --- a/llm/ggla.go +++ b/llm/ggla.go @@ -36,6 +36,8 @@ type ggla struct { kv KV tensors []*Tensor + + tensorOffset uint64 } func newGGLA(container *containerGGLA) *ggla { 
@@ -50,7 +52,10 @@ func (llm *ggla) KV() KV { } func (llm *ggla) Tensors() Tensors { - return llm.tensors + return Tensors{ + Items: llm.tensors, + Offset: llm.tensorOffset, + } } func (llm *ggla) decode(rs io.ReadSeeker) (retErr error) { @@ -66,6 +71,13 @@ func (llm *ggla) decode(rs io.ReadSeeker) (retErr error) { } llm.kv["alpha"] = alpha + offset, err := rs.Seek(0, io.SeekCurrent) + if err != nil { + return err + } + + llm.tensorOffset = uint64(offset) + for { var dims uint32 if err := binary.Read(rs, binary.LittleEndian, &dims); err != nil { diff --git a/llm/ggml.go b/llm/ggml.go index fddb5039..d7f2eef7 100644 --- a/llm/ggml.go +++ b/llm/ggml.go @@ -112,11 +112,14 @@ func (kv KV) ChatTemplate() string { return s } -type Tensors []*Tensor +type Tensors struct { + Items []*Tensor + Offset uint64 +} func (ts Tensors) Layers() map[string]Layer { layers := make(map[string]Layer) - for _, t := range ts { + for _, t := range ts.Items { parts := strings.Split(t.Name, ".") if parts[0] == "blk" { // join first and second part, e.g. blk.%d diff --git a/llm/gguf.go b/llm/gguf.go index a8427aed..aadfc4ba 100644 --- a/llm/gguf.go +++ b/llm/gguf.go @@ -89,6 +89,7 @@ type gguf struct { tensors []*Tensor parameters uint64 + tensorOffset uint64 scratch [16 << 10]byte } @@ -109,7 +110,10 @@ func (llm *gguf) KV() KV { } func (llm *gguf) Tensors() Tensors { - return llm.tensors + return Tensors{ + Items: llm.tensors, + Offset: llm.tensorOffset, + } } func (llm *gguf) numTensor() uint64 { @@ -236,6 +240,14 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error { alignment = 32 } + offset, err := rs.Seek(0, io.SeekCurrent) + if err != nil { + return err + } + + padding := llm.padding(offset, int64(alignment)) + llm.tensorOffset = uint64(offset + padding) + for _, tensor := range llm.tensors { offset, err := rs.Seek(0, io.SeekCurrent) if err != nil { From 0f3271db8892581f800b6f1c4a795aac4f3127c6 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Wed, 31 Jul 2024 14:48:06 -0700 Subject: [PATCH 62/79] patches: phi3 default sliding window attention --- llm/patches/11-phi3-sliding-window.diff | 43 +++++++++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100644 llm/patches/11-phi3-sliding-window.diff diff --git a/llm/patches/11-phi3-sliding-window.diff b/llm/patches/11-phi3-sliding-window.diff new file mode 100644 index 00000000..fde3dd21 --- /dev/null +++ b/llm/patches/11-phi3-sliding-window.diff @@ -0,0 +1,43 @@ +From 6eedae4cf2fcc8015dac79cb3f28f61fcabacab2 Mon Sep 17 00:00:00 2001 +From: Michael Yang +Date: Wed, 31 Jul 2024 14:57:04 -0700 +Subject: [PATCH] phi3 sliding window + +--- + src/llama.cpp | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/src/llama.cpp b/src/llama.cpp +index a207451f..f2872d4e 100644 +--- a/src/llama.cpp ++++ b/src/llama.cpp +@@ -4893,7 +4893,7 @@ static void llm_load_hparams( + } break; + case LLM_ARCH_PHI3: + { +- ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); ++ ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + + switch (hparams.n_layer) { +@@ -10762,7 +10762,7 @@ struct llm_build_context { + struct ggml_tensor * inp_pos = build_inp_pos(); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) +- struct ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa(); ++ struct ggml_tensor * KQ_mask = hparams.n_swa > 0 ? 
build_inp_KQ_mask_swa() : build_inp_KQ_mask(); + + for (int il = 0; il < n_layer; ++il) { + auto residual = inpL; +@@ -10820,7 +10820,7 @@ struct llm_build_context { + + cur = llm_build_kv(ctx0, lctx, kv_self, gf, + model.layers[il].wo, model.layers[il].bo, +- Kcur, Vcur, Qcur, KQ_mask_swa, n_tokens, kv_head, n_kv, 1.0f, cb, il); ++ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); + } + + if (il == n_layer - 1) { +-- +2.45.2 + From 5e9db9fb0bcefbe599734b02dd030f4a347ce576 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Fri, 31 May 2024 20:00:49 -0700 Subject: [PATCH 63/79] refactor convert --- convert/convert.go | 243 +++++------- convert/convert_gemma.go | 103 ++++++ convert/convert_llama.go | 182 +++++++++ convert/convert_mixtral.go | 89 +++++ convert/convert_test.go | 25 +- convert/gemma.go | 102 ----- convert/llama.go | 159 -------- convert/mistral.go | 84 ----- convert/mixtral.go | 87 ----- convert/reader.go | 74 ++++ convert/reader_safetensors.go | 140 +++++++ convert/reader_torch.go | 46 +++ convert/safetensors.go | 309 ---------------- .../testdata/Mistral-7B-Instruct-v0.2.json | 2 +- .../testdata/Mixtral-8x7B-Instruct-v0.1.json | 349 +++++++++++++++++- convert/tokenizer.go | 265 ++++++++++--- convert/tokenizer_spm.go | 83 +++++ convert/torch.go | 287 -------------- llm/gguf.go | 326 +++++++--------- llm/memory_test.go | 6 +- server/model.go | 26 +- server/routes_create_test.go | 5 +- server/routes_generate_test.go | 8 +- server/sched_test.go | 8 +- 24 files changed, 1514 insertions(+), 1494 deletions(-) create mode 100644 convert/convert_gemma.go create mode 100644 convert/convert_llama.go create mode 100644 convert/convert_mixtral.go delete mode 100644 convert/gemma.go delete mode 100644 convert/llama.go delete mode 100644 convert/mistral.go delete mode 100644 convert/mixtral.go create mode 100644 convert/reader.go create mode 100644 convert/reader_safetensors.go create mode 100644 convert/reader_torch.go delete mode 100644 convert/safetensors.go create mode 100644 convert/tokenizer_spm.go delete mode 100644 convert/torch.go diff --git a/convert/convert.go b/convert/convert.go index 103de457..4ad64d72 100644 --- a/convert/convert.go +++ b/convert/convert.go @@ -1,200 +1,123 @@ package convert import ( - "cmp" - "encoding/binary" "encoding/json" + "errors" "fmt" "io" "log/slog" "os" "path/filepath" - "slices" - "strings" - "google.golang.org/protobuf/proto" - - "github.com/ollama/ollama/convert/sentencepiece" "github.com/ollama/ollama/llm" ) -const ( - _ int32 = iota - tokenTypeNormal - tokenTypeUnknown - tokenTypeControl - tokenTypeUserDefined - tokenTypeUnused - tokenTypeByte -) - -type Params struct { - Architectures []string `json:"architectures"` - VocabSize int `json:"vocab_size"` - HiddenSize int `json:"hidden_size"` // n_embd - HiddenLayers int `json:"num_hidden_layers"` // n_layer - ContextSize int `json:"max_position_embeddings"` - IntermediateSize int `json:"intermediate_size"` - AttentionHeads int `json:"num_attention_heads"` // n_head - KeyValHeads int `json:"num_key_value_heads"` - NormEPS float64 `json:"rms_norm_eps"` - BoSTokenID int `json:"bos_token_id"` - EoSTokenID int `json:"eos_token_id"` - HeadDimension int `json:"head_dim"` - PaddingTokenID int `json:"pad_token_id"` - RopeFrequencyBase float64 `json:"rope_theta"` - - Experts int `json:"num_local_experts"` - ExpertsUsed int `json:"num_experts_per_tok"` - - PreTokenizer string - - ByteOrder +type Parameters struct { + Architectures []string `json:"architectures"` + VocabSize uint32 
`json:"vocab_size"` } -type ByteOrder interface { - binary.ByteOrder - binary.AppendByteOrder +func (Parameters) KV(t *Tokenizer) llm.KV { + kv := llm.KV{ + "general.file_type": uint32(1), + "general.quantization_version": uint32(2), + "tokenizer.ggml.pre": t.Pre, + "tokenizer.ggml.model": t.Vocabulary.Model, + "tokenizer.ggml.tokens": t.Vocabulary.Tokens, + "tokenizer.ggml.scores": t.Vocabulary.Scores, + "tokenizer.ggml.token_type": t.Vocabulary.Types, + } + + if t.Template != "" { + kv["tokenizer.chat_template"] = t.Template + } + + for _, sv := range t.SpecialVocabulary { + kv[fmt.Sprintf("tokenizer.ggml.%s_token_id", sv.Key())] = uint32(sv.ID) + kv[fmt.Sprintf("tokenizer.ggml.add_%s_token", sv.Key())] = sv.AddToken + } + + return kv } -type ModelArch interface { - GetTensors() error - LoadVocab() error - WriteGGUF(io.WriteSeeker) error +func (Parameters) specialTypes() []string { + return []string{ + "bos", "eos", "unk", "sep", "pad", "cls", "mask", + } } -type ModelFormat interface { - GetLayerName(string) (string, error) - GetTensors(string, *Params) ([]llm.Tensor, error) - GetParams(string) (*Params, error) - GetModelArch(string, string, *Params) (ModelArch, error) +func (Parameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []*llm.Tensor) error { + return llm.WriteGGUF(ws, kv, ts) } -type ModelData struct { - Path string - Name string - Params *Params - Vocab *Vocab - Tensors []llm.Tensor - Format ModelFormat +type Converter interface { + // KV maps parameters to LLM key-values + KV(*Tokenizer) llm.KV + // Tensors maps input tensors to LLM tensors. Model specific modifications can be done here. + Tensors([]Tensor) []*llm.Tensor + + // tensorName returns the LLM tensor name for a specific input name + tensorName(string) string + // specialTypes returns any special token types the model uses + specialTypes() []string + writeFile(io.WriteSeeker, llm.KV, []*llm.Tensor) error } -func GetModelFormat(dirname string) (ModelFormat, error) { - files, err := filepath.Glob(filepath.Join(dirname, "*")) +func Convert(d string, ws io.WriteSeeker) error { + f, err := os.Open(filepath.Join(d, "config.json")) if err != nil { - return nil, err + return err + } + defer f.Close() + + var p Parameters + if err := json.NewDecoder(f).Decode(&p); err != nil { + return err } - for _, fn := range files { - if strings.HasSuffix(fn, ".safetensors") { - return &SafetensorFormat{}, nil - } else if strings.HasSuffix(fn, ".bin") || strings.HasSuffix(fn, ".pth") { - slog.Debug("model is torch") - return &TorchFormat{}, nil - } + if len(p.Architectures) < 1 { + return errors.New("unknown architecture") } - return nil, fmt.Errorf("couldn't determine model format") -} + var c Converter + switch p.Architectures[0] { + case "LlamaForCausalLM", "MistralForCausalLM": + c = &llama{} + case "MixtralForCausalLM": + c = &mixtral{} + case "GemmaForCausalLM": + c = &gemma{} + default: + return errors.New("unsupported architecture") + } -// Details on gguf's tokenizer can be found at: -// https://github.com/ggerganov/ggml/blob/master/docs/gguf.md#tokenizer -type Vocab struct { - Tokens []string - Scores []float32 - Types []int32 - Merges []string -} - -func LoadSentencePieceTokens(dirpath string, params *Params) (*Vocab, error) { - slog.Info(fmt.Sprintf("reading vocab from %s", filepath.Join(dirpath, "tokenizer.model"))) - in, err := os.ReadFile(filepath.Join(dirpath, "tokenizer.model")) + bts, err := os.ReadFile(filepath.Join(d, "config.json")) if err != nil { - return nil, err + return err } - // To regenerate sentencepiece 
from the protobufs use: - // protoc -I=./ --go_out=./ sentencepiece_model.proto - modelProto := &sentencepiece.ModelProto{} - if err := proto.Unmarshal(in, modelProto); err != nil { - return nil, err + if err := json.Unmarshal(bts, c); err != nil { + return err } - v := &Vocab{ - Tokens: make([]string, 0), - Scores: make([]float32, 0), - Types: make([]int32, 0), + t, err := parseTokenizer(d, c.specialTypes()) + if err != nil { + return err } - pieces := modelProto.GetPieces() - for _, p := range pieces { - v.Tokens = append(v.Tokens, p.GetPiece()) - v.Scores = append(v.Scores, p.GetScore()) - t := p.GetType() - switch t { - case sentencepiece.ModelProto_SentencePiece_UNKNOWN: - case sentencepiece.ModelProto_SentencePiece_CONTROL: - case sentencepiece.ModelProto_SentencePiece_UNUSED: - case sentencepiece.ModelProto_SentencePiece_BYTE: - default: - t = sentencepiece.ModelProto_SentencePiece_NORMAL - } - v.Types = append(v.Types, int32(t)) - } - - slog.Info(fmt.Sprintf("vocab size: %d", len(v.Tokens))) - - // add any additional tokens - addIn, err := os.ReadFile(filepath.Join(dirpath, "added_tokens.json")) - if os.IsNotExist(err) { - return v, nil - } else if err != nil { - return nil, err - } - - slog.Info("reading user defined tokens") - - var extraTokenData map[string]int - if err := json.Unmarshal(addIn, &extraTokenData); err != nil { - return nil, err - } - - type token struct { - key string - pos int - } - - extraTokens := make([]token, 0) - for k, id := range extraTokenData { - extraTokens = append(extraTokens, token{k, id}) - } - - slices.SortFunc(extraTokens, func(a, b token) int { - return cmp.Compare(a.pos, b.pos) - }) - - numToks := len(v.Tokens) - - for cnt, t := range extraTokens { - // the token id should match the specific index for the total number of tokens - if t.pos != cnt+numToks { - return nil, fmt.Errorf("token ID '%d' for '%s' doesn't match total token size", t.pos, t.key) - } - v.Tokens = append(v.Tokens, t.key) - v.Scores = append(v.Scores, -1000.0) - v.Types = append(v.Types, tokenTypeUserDefined) - } - slog.Info(fmt.Sprintf("vocab size w/ extra tokens: %d", len(v.Tokens))) - - if params.VocabSize > len(v.Tokens) { - missingTokens := params.VocabSize - len(v.Tokens) - slog.Warn(fmt.Sprintf("vocab is missing %d tokens", missingTokens)) - for cnt := range missingTokens { - v.Tokens = append(v.Tokens, fmt.Sprintf("", cnt+1)) - v.Scores = append(v.Scores, -1) - v.Types = append(v.Types, tokenTypeUserDefined) + if vocabSize := int(p.VocabSize); vocabSize > len(t.Vocabulary.Tokens) { + slog.Warn("vocabulary is smaller than expected, padding with dummy tokens", "expect", p.VocabSize, "actual", len(t.Vocabulary.Tokens)) + for i := range vocabSize - len(t.Vocabulary.Tokens) { + t.Vocabulary.Tokens = append(t.Vocabulary.Tokens, fmt.Sprintf("[PAD%d]", i)) + t.Vocabulary.Scores = append(t.Vocabulary.Scores, -1) + t.Vocabulary.Types = append(t.Vocabulary.Types, tokenTypeUserDefined) } } - return v, nil + ts, err := parseTensors(d) + if err != nil { + return err + } + + return c.writeFile(ws, c.KV(t), c.Tensors(ts)) } diff --git a/convert/convert_gemma.go b/convert/convert_gemma.go new file mode 100644 index 00000000..332fee7f --- /dev/null +++ b/convert/convert_gemma.go @@ -0,0 +1,103 @@ +package convert + +import ( + "strings" + + "github.com/pdevine/tensor" + "github.com/pdevine/tensor/native" + + "github.com/ollama/ollama/llm" +) + +type gemma struct { + Parameters + MaxPositionEmbeddings uint32 `json:"max_position_embeddings"` + HiddenSize uint32 `json:"hidden_size"` + 
HiddenLayers uint32 `json:"num_hidden_layers"` + IntermediateSize uint32 `json:"intermediate_size"` + NumAttentionHeads uint32 `json:"num_attention_heads"` + NumKeyValueHeads uint32 `json:"num_key_value_heads"` + RMSNormEPS float32 `json:"rms_norm_eps"` + HeadDim uint32 `json:"head_dim"` +} + +var _ Converter = (*gemma)(nil) + +func (p *gemma) KV(t *Tokenizer) llm.KV { + kv := p.Parameters.KV(t) + kv["general.architecture"] = "gemma" + kv["general.name"] = "gemma" + kv["gemma.context_length"] = p.MaxPositionEmbeddings + kv["gemma.embedding_length"] = p.HiddenSize + kv["gemma.block_count"] = p.HiddenLayers + kv["gemma.feed_forward_length"] = p.IntermediateSize + kv["gemma.attention.head_count"] = p.NumAttentionHeads + kv["gemma.attention.head_count_kv"] = p.NumKeyValueHeads + kv["gemma.attention.layer_norm_rms_epsilon"] = p.RMSNormEPS + kv["gemma.attention.key_length"] = p.HeadDim + kv["gemma.attention.value_length"] = p.HeadDim + kv["tokenizer.ggml.eot_token_id"] = uint32(107) + kv["tokenizer.ggml.middle_token_id"] = uint32(68) + kv["tokenizer.ggml.prefix_token_id"] = uint32(67) + kv["tokenizer.ggml.suffix_token_id"] = uint32(69) + return kv +} + +func (p *gemma) Tensors(ts []Tensor) []*llm.Tensor { + var out []*llm.Tensor + for _, t := range ts { + name := p.tensorName(t.Name()) + if strings.HasSuffix(name, "_norm.weight") { + t.SetRepacker(p.addOne) + } + + out = append(out, &llm.Tensor{ + Name: name, + Kind: t.Kind(), + Shape: t.Shape(), + WriterTo: t, + }) + } + + return out +} + +func (p *gemma) tensorName(n string) string { + return strings.NewReplacer( + "model.embed_tokens", "token_embd", + "model.norm", "output_norm", + "model.layers", "blk", + "input_layernorm", "attn_norm", + "self_attn.q_proj", "attn_q", + "self_attn.k_proj", "attn_k", + "self_attn.v_proj", "attn_v", + "self_attn.o_proj", "attn_output", + "mlp.gate_proj", "ffn_gate", + "mlp.down_proj", "ffn_down", + "mlp.up_proj", "ffn_up", + "post_attention_layernorm", "ffn_norm", + "block_sparse_moe.gate", "ffn_inp", + ).Replace(n) +} + +func (*gemma) addOne(_ string, data []float32, shape []uint64) ([]float32, error) { + n := tensor.New(tensor.WithShape(int(shape[0])), tensor.WithBacking(data)) + ones := tensor.Ones(tensor.Float32, int(shape[0])) + + n, err := n.Add(ones) + if err != nil { + return nil, err + } + + ts, err := native.SelectF32(n, 0) + if err != nil { + return nil, err + } + + var f32s []float32 + for _, t := range ts { + f32s = append(f32s, t...) 
+ } + + return f32s, nil +} diff --git a/convert/convert_llama.go b/convert/convert_llama.go new file mode 100644 index 00000000..700049d3 --- /dev/null +++ b/convert/convert_llama.go @@ -0,0 +1,182 @@ +package convert + +import ( + "cmp" + "fmt" + "strings" + + "github.com/ollama/ollama/llm" + "github.com/pdevine/tensor" + "github.com/pdevine/tensor/native" +) + +type llama struct { + Parameters + NLayers uint32 `json:"n_layers"` + NumHiddenLayers uint32 `json:"num_hidden_layers"` + NLayer uint32 `json:"n_layer"` + MaxPositionEmbeddings uint32 `json:"max_position_embeddings"` + NCtx uint32 `json:"n_ctx"` + HiddenSize uint32 `json:"hidden_size"` + NEmbd uint32 `json:"n_embd"` + IntermediateSize uint32 `json:"intermediate_size"` + NInner uint32 `json:"n_inner"` + NumAttentionHeads uint32 `json:"num_attention_heads"` + NHead uint32 `json:"n_head"` + NumKeyValueHeads uint32 `json:"num_key_value_heads"` + RopeTheta float32 `json:"rope_theta"` + RopeScaling struct { + Type string `json:"type"` + Factor float32 `json:"factor"` + } `json:"rope_scaling"` + RMSNormEPS float32 `json:"rms_norm_eps"` + LayerNormEPS float32 `json:"layer_norm_eps"` + LayerNormEpsilon float32 `json:"layer_norm_epsilon"` + NormEpsilon float32 `json:"norm_epsilon"` + HeadDim uint32 `json:"head_dim"` +} + +var _ Converter = (*llama)(nil) + +func (p *llama) KV(t *Tokenizer) llm.KV { + kv := p.Parameters.KV(t) + kv["general.architecture"] = "llama" + kv["general.name"] = "llama" + kv["llama.vocab_size"] = p.VocabSize + + kv["llama.block_count"] = cmp.Or(p.NLayers, p.NumHiddenLayers, p.NLayer) + + if contextLength := cmp.Or(p.MaxPositionEmbeddings, p.NCtx); contextLength > 0 { + kv["llama.context_length"] = contextLength + } + + if embeddingLength := cmp.Or(p.HiddenSize, p.NEmbd); embeddingLength > 0 { + kv["llama.embedding_length"] = cmp.Or(p.HiddenSize, p.NEmbd) + } + + if feedForwardLength := cmp.Or(p.IntermediateSize, p.NInner); feedForwardLength > 0 { + kv["llama.feed_forward_length"] = cmp.Or(p.IntermediateSize, p.NInner) + } + + if headCount := cmp.Or(p.NumAttentionHeads, p.NHead); headCount > 0 { + kv["llama.attention.head_count"] = cmp.Or(p.NumAttentionHeads, p.NHead) + kv["llama.rope.dimension_count"] = p.HiddenSize / headCount + } + + if p.RopeTheta > 0 { + kv["llama.rope.freq_base"] = p.RopeTheta + } + + if p.RopeScaling.Type == "linear" { + kv["llama.rope.scaling.type"] = p.RopeScaling.Type + kv["llama.rope.scaling.factor"] = p.RopeScaling.Factor + } + + if p.NumKeyValueHeads > 0 { + kv["llama.attention.head_count_kv"] = p.NumKeyValueHeads + } + + if p.RMSNormEPS > 0 { + kv["llama.attention.layer_norm_rms_epsilon"] = p.RMSNormEPS + } + + if layerNormEpsilon := cmp.Or(p.LayerNormEPS, p.LayerNormEpsilon, p.NormEpsilon); layerNormEpsilon > 0 { + kv["llama.attention.layer_norm_epsilon"] = layerNormEpsilon + } + + if p.HeadDim > 0 { + kv["llama.attention.key_length"] = p.HeadDim + kv["llama.attention.value_length"] = p.HeadDim + } + + if len(t.Merges) > 0 { + kv["tokenizer.ggml.merges"] = t.Merges + } + + return kv +} + +func (p *llama) Tensors(ts []Tensor) []*llm.Tensor { + var out []*llm.Tensor + for _, t := range ts { + name := p.tensorName(t.Name()) + if strings.HasSuffix(name, "attn_q.weight") || + strings.HasSuffix(name, "attn_k.weight") { + t.SetRepacker(p.repack) + } + + out = append(out, &llm.Tensor{ + Name: name, + Kind: t.Kind(), + Shape: t.Shape(), + WriterTo: t, + }) + } + + return out +} + +func (p *llama) tensorName(n string) string { + return strings.NewReplacer( + "lm_head", "output", + 
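+ // old/new pairs mapping Hugging Face tensor names onto their GGUF counterparts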
"model.embed_tokens", "token_embd", + "model.norm", "output_norm", + "model.layers", "blk", + "input_layernorm", "attn_norm", + "self_attn.q_proj", "attn_q", + "self_attn.k_proj", "attn_k", + "self_attn.v_proj", "attn_v", + "self_attn.o_proj", "attn_output", + "mlp.gate_proj", "ffn_gate", + "mlp.down_proj", "ffn_down", + "mlp.up_proj", "ffn_up", + "post_attention_layernorm", "ffn_norm", + // mixtral + "block_sparse_moe.gate", "ffn_gate_inp", + ).Replace(n) +} + +func (p *llama) repack(name string, data []float32, shape []uint64) ([]float32, error) { + var dims []int + for _, dim := range shape { + dims = append(dims, int(dim)) + } + + var heads uint32 + if strings.HasSuffix(name, "q_proj.weight") { + heads = p.NumAttentionHeads + } else if strings.HasSuffix(name, "k_proj.weight") { + heads = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads) + } else { + return nil, fmt.Errorf("unknown tensor for repack: %s", name) + } + + n := tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data)) + if err := n.Reshape(append([]int{int(heads), 2, dims[0] / int(heads) / 2}, dims[1:]...)...); err != nil { + return nil, err + } + + if err := n.T(0, 2, 1, 3); err != nil { + return nil, err + } + + if err := n.Reshape(dims...); err != nil { + return nil, err + } + + if err := n.Transpose(); err != nil { + return nil, err + } + + ts, err := native.SelectF32(n, 1) + if err != nil { + return nil, err + } + + var f32s []float32 + for _, t := range ts { + f32s = append(f32s, t...) + } + + return f32s, nil +} diff --git a/convert/convert_mixtral.go b/convert/convert_mixtral.go new file mode 100644 index 00000000..c55a27f8 --- /dev/null +++ b/convert/convert_mixtral.go @@ -0,0 +1,89 @@ +package convert + +import ( + "fmt" + "io" + "slices" + "strings" + + "github.com/ollama/ollama/llm" +) + +type mixtral struct { + llama + NumLocalExperts uint32 `json:"num_local_experts"` + NumExpertsPerToken uint32 `json:"num_experts_per_tok"` +} + +var _ Converter = (*mixtral)(nil) + +func (p *mixtral) KV(t *Tokenizer) llm.KV { + kv := p.llama.KV(t) + + if p.NumLocalExperts > 0 { + kv["llama.expert_count"] = p.NumLocalExperts + } + + if p.NumExpertsPerToken > 0 { + kv["llama.expert_used_count"] = p.NumExpertsPerToken + } + + return kv +} + +func (p *mixtral) Tensors(ts []Tensor) []*llm.Tensor { + oldnew := []string{ + "model.layers", "blk", + "w1", "ffn_gate_exps", + "w2", "ffn_down_exps", + "w3", "ffn_up_exps", + } + + for i := range p.NumLocalExperts { + oldnew = append(oldnew, fmt.Sprintf(".block_sparse_moe.experts.%d.", i), ".") + } + + // group experts of the same layer (model.layers.%d) and type (w[123]) into a single tensor + namer := strings.NewReplacer(oldnew...) + experts := make(map[string]experts) + + // merge experts into a single tensor while removing them from ts + ts = slices.DeleteFunc(ts, func(t Tensor) bool { + if !strings.Contains(t.Name(), ".block_sparse_moe.experts.") { + return false + } + + name := namer.Replace(t.Name()) + experts[name] = append(experts[name], t) + return true + }) + + var out []*llm.Tensor + for n, e := range experts { + // TODO(mxyng): sanity check experts + out = append(out, &llm.Tensor{ + Name: n, + Kind: e[0].Kind(), + Shape: append([]uint64{uint64(len(e))}, e[0].Shape()...), + WriterTo: e, + }) + } + + return append(out, p.llama.Tensors(ts)...) 
+} + +type experts []Tensor + +func (e experts) WriteTo(w io.Writer) (int64, error) { + // TODO(mxyng): experts _should_ be numerically sorted by expert but this should check + for _, t := range e { + // the canonical merged experts tensor stacks all experts along a new, 0 axis, + // e.g. `tensor.Stack(0, e[0], e[1:]...)`, which requires allocating temporary buffers + // this accomplishes the same thing by writing each expert tensor in sequence + if _, err := t.WriteTo(w); err != nil { + return 0, err + } + } + + return 0, nil +} diff --git a/convert/convert_test.go b/convert/convert_test.go index a3727bed..0fbd436f 100644 --- a/convert/convert_test.go +++ b/convert/convert_test.go @@ -20,36 +20,13 @@ import ( func convertFull(t *testing.T, d string) (*os.File, llm.KV, llm.Tensors) { t.Helper() - mf, err := GetModelFormat(d) - if err != nil { - t.Fatal(err) - } - - params, err := mf.GetParams(d) - if err != nil { - t.Fatal(err) - } - - arch, err := mf.GetModelArch("", d, params) - if err != nil { - t.Fatal(err) - } - - if err := arch.LoadVocab(); err != nil { - t.Fatal(err) - } - - if err := arch.GetTensors(); err != nil { - t.Fatal(err) - } - f, err := os.CreateTemp(t.TempDir(), "f16") if err != nil { t.Fatal(err) } defer f.Close() - if err := arch.WriteGGUF(f); err != nil { + if err := Convert(d, f); err != nil { t.Fatal(err) } diff --git a/convert/gemma.go b/convert/gemma.go deleted file mode 100644 index d01ffedf..00000000 --- a/convert/gemma.go +++ /dev/null @@ -1,102 +0,0 @@ -package convert - -import ( - "fmt" - "io" - "log/slog" - "strings" - - "github.com/pdevine/tensor" - "github.com/pdevine/tensor/native" - - "github.com/ollama/ollama/llm" -) - -type GemmaModel struct { - ModelData -} - -func addOnes(data []float32, vectorSize int) ([]float32, error) { - n := tensor.New(tensor.WithShape(vectorSize), tensor.WithBacking(data)) - ones := tensor.Ones(tensor.Float32, vectorSize) - - n, err := n.Add(ones) - if err != nil { - return nil, err - } - - ts, err := native.SelectF32(n, 0) - if err != nil { - return nil, err - } - - var f32s []float32 - for _, t := range ts { - f32s = append(f32s, t...) 
- } - - return f32s, nil -} - -func (m *GemmaModel) GetTensors() error { - t, err := m.Format.GetTensors(m.Path, m.Params) - if err != nil { - return err - } - - slog.Debug(fmt.Sprintf("Total tensors: %d", len(t))) - for _, l := range t { - if strings.HasSuffix(l.Name, "norm.weight") { - wt := l.WriterTo.(safetensorWriterTo) - wt.repacker = m.Repack - l.WriterTo = wt - } - m.Tensors = append(m.Tensors, l) - } - - return nil -} - -func (m *GemmaModel) LoadVocab() error { - v, err := LoadSentencePieceTokens(m.Path, m.Params) - if err != nil { - return err - } - m.Vocab = v - return nil -} - -func (m *GemmaModel) Repack(_ string, data []float32, shape []uint64) ([]float32, error) { - return addOnes(data, int(shape[0])) -} - -func (m *GemmaModel) WriteGGUF(ws io.WriteSeeker) error { - kv := llm.KV{ - "general.architecture": "gemma", - "general.name": m.Name, - "gemma.context_length": uint32(m.Params.ContextSize), - "gemma.embedding_length": uint32(m.Params.HiddenSize), - "gemma.block_count": uint32(m.Params.HiddenLayers), - "gemma.feed_forward_length": uint32(m.Params.IntermediateSize), - "gemma.attention.head_count": uint32(m.Params.AttentionHeads), - "gemma.attention.head_count_kv": uint32(m.Params.KeyValHeads), - "gemma.attention.layer_norm_rms_epsilon": float32(m.Params.NormEPS), - "gemma.attention.key_length": uint32(m.Params.HeadDimension), - "gemma.attention.value_length": uint32(m.Params.HeadDimension), - "general.file_type": uint32(1), - "tokenizer.ggml.model": "llama", - - "tokenizer.ggml.tokens": m.Vocab.Tokens, - "tokenizer.ggml.scores": m.Vocab.Scores, - "tokenizer.ggml.token_type": m.Vocab.Types, - - "tokenizer.ggml.bos_token_id": uint32(m.Params.BoSTokenID), - "tokenizer.ggml.eos_token_id": uint32(m.Params.EoSTokenID), - "tokenizer.ggml.padding_token_id": uint32(m.Params.PaddingTokenID), - "tokenizer.ggml.unknown_token_id": uint32(3), - "tokenizer.ggml.add_bos_token": true, - "tokenizer.ggml.add_eos_token": false, - } - - return llm.NewGGUFV3(m.Params.ByteOrder).Encode(ws, kv, m.Tensors) -} diff --git a/convert/llama.go b/convert/llama.go deleted file mode 100644 index b4211b02..00000000 --- a/convert/llama.go +++ /dev/null @@ -1,159 +0,0 @@ -package convert - -import ( - "cmp" - "errors" - "fmt" - "io" - "os" - "path/filepath" - "regexp" - "strings" - - "github.com/pdevine/tensor" - "github.com/pdevine/tensor/native" - - "github.com/ollama/ollama/llm" -) - -type LlamaModel struct { - ModelData -} - -func (m *LlamaModel) GetTensors() error { - t, err := m.Format.GetTensors(m.Path, m.Params) - if err != nil { - return err - } - - pattern := `^blk\.[0-9]+\.attn_(?Pq|k)\.weight$` - re, err := regexp.Compile(pattern) - if err != nil { - return err - } - - for _, l := range t { - matches := re.FindAllStringSubmatch(l.Name, -1) - if len(matches) > 0 { - switch m.Format.(type) { - case *TorchFormat: - wt := l.WriterTo.(torchWriterTo) - wt.repacker = m.Repack - l.WriterTo = wt - case *SafetensorFormat: - wt := l.WriterTo.(safetensorWriterTo) - wt.repacker = m.Repack - l.WriterTo = wt - } - } - m.Tensors = append(m.Tensors, l) - } - - return nil -} - -func (m *LlamaModel) LoadVocab() (err error) { - pre, ts, merges, err := parseTokens(filepath.Join(m.Path, "tokenizer.json")) - if errors.Is(err, os.ErrNotExist) { - return nil - } else if err != nil { - return err - } - - m.Vocab = &Vocab{} - for _, t := range ts { - m.Vocab.Tokens = append(m.Vocab.Tokens, t.Content) - m.Vocab.Types = append(m.Vocab.Types, t.Type()) - } - - m.Vocab.Merges = merges - m.Params.PreTokenizer = pre - return nil 
-} - -func (m *LlamaModel) WriteGGUF(ws io.WriteSeeker) error { - kv := llm.KV{ - "general.architecture": "llama", - "general.name": m.Name, - "llama.vocab_size": uint32(len(m.Vocab.Tokens)), - "llama.context_length": uint32(m.Params.ContextSize), - "llama.embedding_length": uint32(m.Params.HiddenSize), - "llama.block_count": uint32(m.Params.HiddenLayers), - "llama.feed_forward_length": uint32(m.Params.IntermediateSize), - "llama.rope.freq_base": float32(m.Params.RopeFrequencyBase), - "llama.rope.dimension_count": uint32(m.Params.HiddenSize / m.Params.AttentionHeads), - "llama.attention.head_count": uint32(m.Params.AttentionHeads), - "llama.attention.head_count_kv": uint32(m.Params.KeyValHeads), - "llama.attention.layer_norm_rms_epsilon": float32(m.Params.NormEPS), - "general.file_type": uint32(1), - "tokenizer.ggml.model": "gpt2", - - "tokenizer.ggml.pre": m.Params.PreTokenizer, - "tokenizer.ggml.tokens": m.Vocab.Tokens, - "tokenizer.ggml.token_type": m.Vocab.Types, - - "tokenizer.ggml.bos_token_id": uint32(m.Params.BoSTokenID), - "tokenizer.ggml.eos_token_id": uint32(m.Params.EoSTokenID), - "tokenizer.ggml.unknown_token_id": uint32(0), - } - - if len(m.Vocab.Merges) > 0 { - kv["tokenizer.ggml.merges"] = m.Vocab.Merges - } else { - kv["tokenizer.ggml.scores"] = m.Vocab.Scores - } - - return llm.NewGGUFV3(m.Params.ByteOrder).Encode(ws, kv, m.Tensors) -} - -func (m *LlamaModel) Repack(name string, data []float32, shape []uint64) ([]float32, error) { - return llamaRepack(name, m.Params, data, shape) -} - -func llamaRepack(name string, params *Params, data []float32, shape []uint64) ([]float32, error) { - var dims []int - for _, dim := range shape { - if dim != 0 { - dims = append(dims, int(dim)) - } - } - - var heads int - switch { - case strings.HasSuffix(name, "attn_q.weight"): - heads = params.AttentionHeads - case strings.HasSuffix(name, "attn_k.weight"): - heads = cmp.Or(params.KeyValHeads, params.AttentionHeads) - default: - return nil, fmt.Errorf("unknown tensor name: %s", name) - } - - n := tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data)) - if err := n.Reshape(append([]int{heads, 2, dims[0] / heads / 2}, dims[1:]...)...); err != nil { - return nil, err - } - - if err := n.T(0, 2, 1, 3); err != nil { - return nil, err - } - - if err := n.Reshape(dims...); err != nil { - return nil, err - } - - if err := n.Transpose(); err != nil { - return nil, err - } - - ts, err := native.SelectF32(n, 1) - if err != nil { - return nil, err - } - - var f32s []float32 - for _, t := range ts { - f32s = append(f32s, t...) 
- } - - return f32s, nil -} diff --git a/convert/mistral.go b/convert/mistral.go deleted file mode 100644 index 8fe066d6..00000000 --- a/convert/mistral.go +++ /dev/null @@ -1,84 +0,0 @@ -package convert - -import ( - "io" - "regexp" - - "github.com/ollama/ollama/llm" -) - -type MistralModel struct { - ModelData -} - -func (m *MistralModel) GetTensors() error { - t, err := m.Format.GetTensors(m.Path, m.Params) - if err != nil { - return err - } - - pattern := `^blk\.[0-9]+\.attn_(?Pq|k)\.weight$` - re, err := regexp.Compile(pattern) - if err != nil { - return err - } - - for _, l := range t { - matches := re.FindAllStringSubmatch(l.Name, -1) - if len(matches) > 0 { - wt := l.WriterTo.(safetensorWriterTo) - wt.repacker = m.Repack - l.WriterTo = wt - } - m.Tensors = append(m.Tensors, l) - } - - return nil -} - -func (m *MistralModel) LoadVocab() error { - v, err := LoadSentencePieceTokens(m.Path, m.Params) - if err != nil { - return err - } - m.Vocab = v - return nil -} - -func (m *MistralModel) WriteGGUF(ws io.WriteSeeker) error { - kv := llm.KV{ - "general.architecture": "llama", - "general.name": m.Name, - "llama.context_length": uint32(m.Params.ContextSize), - "llama.embedding_length": uint32(m.Params.HiddenSize), - "llama.block_count": uint32(m.Params.HiddenLayers), - "llama.feed_forward_length": uint32(m.Params.IntermediateSize), - "llama.rope.dimension_count": uint32(m.Params.HiddenSize / m.Params.AttentionHeads), - "llama.attention.head_count": uint32(m.Params.AttentionHeads), - "llama.attention.head_count_kv": uint32(m.Params.KeyValHeads), - "llama.attention.layer_norm_rms_epsilon": float32(m.Params.NormEPS), - "general.file_type": uint32(1), - "tokenizer.ggml.model": "llama", - - "tokenizer.ggml.tokens": m.Vocab.Tokens, - "tokenizer.ggml.scores": m.Vocab.Scores, - "tokenizer.ggml.token_type": m.Vocab.Types, - - "tokenizer.ggml.bos_token_id": uint32(m.Params.BoSTokenID), - "tokenizer.ggml.eos_token_id": uint32(m.Params.EoSTokenID), - "tokenizer.ggml.add_bos_token": true, - "tokenizer.ggml.add_eos_token": false, - "tokenizer.ggml.unknown_token_id": uint32(0), - } - - if m.Params.HeadDimension > 0 { - kv["llama.attention.key_length"] = uint32(m.Params.HeadDimension) - kv["llama.attention.value_length"] = uint32(m.Params.HeadDimension) - } - - return llm.NewGGUFV3(m.Params.ByteOrder).Encode(ws, kv, m.Tensors) -} - -func (m *MistralModel) Repack(name string, data []float32, shape []uint64) ([]float32, error) { - return llamaRepack(name, m.Params, data, shape) -} diff --git a/convert/mixtral.go b/convert/mixtral.go deleted file mode 100644 index baea68cd..00000000 --- a/convert/mixtral.go +++ /dev/null @@ -1,87 +0,0 @@ -package convert - -import ( - "io" - "regexp" - - "github.com/ollama/ollama/llm" -) - -type MixtralModel struct { - ModelData -} - -func (m *MixtralModel) GetTensors() error { - t, err := m.Format.GetTensors(m.Path, m.Params) - if err != nil { - return err - } - - pattern := `^blk\.[0-9]+\.attn_(?Pq|k)\.weight$` - re, err := regexp.Compile(pattern) - if err != nil { - return err - } - - for _, l := range t { - matches := re.FindAllStringSubmatch(l.Name, -1) - if len(matches) > 0 { - wt := l.WriterTo.(safetensorWriterTo) - wt.repacker = m.Repack - l.WriterTo = wt - } - m.Tensors = append(m.Tensors, l) - } - - return nil -} - -func (m *MixtralModel) LoadVocab() error { - v, err := LoadSentencePieceTokens(m.Path, m.Params) - if err != nil { - return err - } - m.Vocab = v - return nil -} - -func (m *MixtralModel) WriteGGUF(ws io.WriteSeeker) error { - kv := llm.KV{ - 
"general.architecture": "llama", - "general.name": m.Name, - "llama.block_count": uint32(m.Params.HiddenLayers), - "llama.context_length": uint32(m.Params.ContextSize), - "llama.embedding_length": uint32(m.Params.HiddenSize), - "llama.feed_forward_length": uint32(m.Params.IntermediateSize), - "llama.attention.head_count": uint32(m.Params.AttentionHeads), - "llama.attention.head_count_kv": uint32(m.Params.KeyValHeads), - - "llama.rope.freq_base": float32(m.Params.RopeFrequencyBase), - "llama.attention.layer_norm_rms_epsilon": float32(m.Params.NormEPS), - - "llama.expert_count": uint32(m.Params.Experts), - "llama.expert_used_count": uint32(m.Params.ExpertsUsed), - - "llama.vocab_size": uint32(len(m.Vocab.Tokens)), - "llama.rope.dimension_count": uint32(m.Params.HiddenSize / m.Params.AttentionHeads), - - "general.file_type": uint32(1), - "tokenizer.ggml.model": "llama", - - "tokenizer.ggml.tokens": m.Vocab.Tokens, - "tokenizer.ggml.scores": m.Vocab.Scores, - "tokenizer.ggml.token_type": m.Vocab.Types, - - "tokenizer.ggml.bos_token_id": uint32(m.Params.BoSTokenID), - "tokenizer.ggml.eos_token_id": uint32(m.Params.EoSTokenID), - "tokenizer.ggml.unknown_token_id": uint32(0), - "tokenizer.ggml.add_bos_token": true, - "tokenizer.ggml.add_eos_token": false, - } - - return llm.NewGGUFV3(m.Params.ByteOrder).Encode(ws, kv, m.Tensors) -} - -func (m *MixtralModel) Repack(name string, data []float32, shape []uint64) ([]float32, error) { - return llamaRepack(name, m.Params, data, shape) -} diff --git a/convert/reader.go b/convert/reader.go new file mode 100644 index 00000000..9be8ac2e --- /dev/null +++ b/convert/reader.go @@ -0,0 +1,74 @@ +package convert + +import ( + "errors" + "io" + "path/filepath" + "strings" +) + +type Tensor interface { + Name() string + Shape() []uint64 + Kind() uint32 + SetRepacker(repacker) + WriteTo(io.Writer) (int64, error) +} + +type tensorBase struct { + name string + shape []uint64 + repacker +} + +func (t tensorBase) Name() string { + return t.name +} + +func (t tensorBase) Shape() []uint64 { + return t.shape +} + +func (t tensorBase) Kind() uint32 { + if strings.HasSuffix(t.name, ".block_sparse_moe.gate.weight") { + return 0 + } + + switch len(t.shape) { + case 0: + panic("invalid tensor shape") + case 1: + return 0 + default: + return 1 + } +} + +func (t *tensorBase) SetRepacker(fn repacker) { + t.repacker = fn +} + +type repacker func(string, []float32, []uint64) ([]float32, error) + +func parseTensors(d string) ([]Tensor, error) { + patterns := map[string]func(...string) ([]Tensor, error){ + "model-*-of-*.safetensors": parseSafetensors, + "model.safetensors": parseSafetensors, + "pytorch_model-*-of-*.bin": parseTorch, + "pytorch_model.bin": parseTorch, + "consolidated.*.pth": parseTorch, + } + + for pattern, parseFn := range patterns { + matches, err := filepath.Glob(filepath.Join(d, pattern)) + if err != nil { + return nil, err + } + + if len(matches) > 0 { + return parseFn(matches...) 
+ } + } + + return nil, errors.New("unknown tensor format") +} diff --git a/convert/reader_safetensors.go b/convert/reader_safetensors.go new file mode 100644 index 00000000..440581af --- /dev/null +++ b/convert/reader_safetensors.go @@ -0,0 +1,140 @@ +package convert + +import ( + "bytes" + "encoding/binary" + "encoding/json" + "fmt" + "io" + "os" + "slices" + + "github.com/d4l3k/go-bfloat16" + "github.com/x448/float16" + "golang.org/x/exp/maps" +) + +type safetensorMetadata struct { + Type string `json:"dtype"` + Shape []uint64 `json:"shape"` + Offsets []int64 `json:"data_offsets"` +} + +func parseSafetensors(ps ...string) ([]Tensor, error) { + var ts []Tensor + for _, p := range ps { + f, err := os.Open(p) + if err != nil { + return nil, err + } + defer f.Close() + + var n int64 + if err := binary.Read(f, binary.LittleEndian, &n); err != nil { + return nil, err + } + + b := bytes.NewBuffer(make([]byte, 0, n)) + if _, err = io.CopyN(b, f, n); err != nil { + return nil, err + } + + var headers map[string]safetensorMetadata + if err := json.NewDecoder(b).Decode(&headers); err != nil { + return nil, err + } + + keys := maps.Keys(headers) + slices.Sort(keys) + + for _, key := range keys { + if value := headers[key]; value.Type != "" { + ts = append(ts, safetensor{ + path: p, + dtype: value.Type, + offset: safetensorsPad(n, value.Offsets[0]), + size: safetensorsPad(n, value.Offsets[1]) - safetensorsPad(n, value.Offsets[0]), + tensorBase: &tensorBase{ + name: key, + shape: value.Shape, + }, + }) + } + } + } + + return ts, nil +} + +func safetensorsPad(n, s int64) int64 { + return 8 + n + s +} + +type safetensor struct { + path string + dtype string + offset int64 + size int64 + *tensorBase +} + +func (st safetensor) WriteTo(w io.Writer) (int64, error) { + f, err := os.Open(st.path) + if err != nil { + return 0, err + } + defer f.Close() + + if _, err = f.Seek(st.offset, io.SeekStart); err != nil { + return 0, err + } + + var f32s []float32 + switch st.dtype { + case "F32": + f32s = make([]float32, st.size/4) + if err = binary.Read(f, binary.LittleEndian, f32s); err != nil { + return 0, err + } + case "F16": + u16s := make([]uint16, st.size/2) + if err = binary.Read(f, binary.LittleEndian, u16s); err != nil { + return 0, err + } + + for _, b := range u16s { + f32s = append(f32s, float16.Frombits(b).Float32()) + } + + case "BF16": + u8s := make([]uint8, st.size) + if err = binary.Read(f, binary.LittleEndian, u8s); err != nil { + return 0, err + } + + f32s = bfloat16.DecodeFloat32(u8s) + default: + return 0, fmt.Errorf("unknown data type: %s", st.dtype) + } + + if st.repacker != nil { + f32s, err = st.repacker(st.Name(), f32s, st.Shape()) + if err != nil { + return 0, err + } + } + + switch st.Kind() { + case 0: + return 0, binary.Write(w, binary.LittleEndian, f32s) + case 1: + f16s := make([]uint16, len(f32s)) + for i := range f32s { + f16s[i] = float16.Fromfloat32(f32s[i]).Bits() + } + + return 0, binary.Write(w, binary.LittleEndian, f16s) + default: + return 0, fmt.Errorf("unknown storage type: %d", st.Kind()) + } +} diff --git a/convert/reader_torch.go b/convert/reader_torch.go new file mode 100644 index 00000000..1428706e --- /dev/null +++ b/convert/reader_torch.go @@ -0,0 +1,46 @@ +package convert + +import ( + "io" + + "github.com/nlpodyssey/gopickle/pytorch" + "github.com/nlpodyssey/gopickle/types" +) + +func parseTorch(ps ...string) ([]Tensor, error) { + var ts []Tensor + for _, p := range ps { + pt, err := pytorch.Load(p) + if err != nil { + return nil, err + } + + for _, k := range 
pt.(*types.Dict).Keys() { + t := pt.(*types.Dict).MustGet(k) + + var shape []uint64 + for dim := range t.(*pytorch.Tensor).Size { + shape = append(shape, uint64(dim)) + } + + ts = append(ts, torch{ + storage: t.(*pytorch.Tensor).Source, + tensorBase: &tensorBase{ + name: k.(string), + shape: shape, + }, + }) + } + } + + return ts, nil +} + +type torch struct { + storage pytorch.StorageInterface + *tensorBase +} + +func (pt torch) WriteTo(w io.Writer) (int64, error) { + return 0, nil +} diff --git a/convert/safetensors.go b/convert/safetensors.go deleted file mode 100644 index f45687f1..00000000 --- a/convert/safetensors.go +++ /dev/null @@ -1,309 +0,0 @@ -package convert - -import ( - "bytes" - "encoding/binary" - "encoding/json" - "fmt" - "io" - "os" - "path/filepath" - "regexp" - "slices" - "strings" - - "github.com/d4l3k/go-bfloat16" - "github.com/x448/float16" - - "github.com/ollama/ollama/llm" -) - -type safetensorWriterTo struct { - t *llm.Tensor - - params *Params - bo ByteOrder - - filename string - dtype string - - offset, size int64 - repacker func(string, []float32, []uint64) ([]float32, error) -} - -type safetensorMetadata struct { - Type string `json:"dtype"` - Shape []uint64 `json:"shape"` - Offsets []int64 `json:"data_offsets"` -} - -type SafetensorFormat struct{} - -func (m *SafetensorFormat) GetTensors(dirpath string, params *Params) ([]llm.Tensor, error) { - var tensors []llm.Tensor - matches, err := filepath.Glob(filepath.Join(dirpath, "*.safetensors")) - if err != nil { - return nil, err - } - - var offset uint64 - for _, f := range matches { - var t []llm.Tensor - var err error - t, offset, err = m.readTensors(f, offset, params) - if err != nil { - return nil, err - } - - tensors = append(tensors, t...) - } - return tensors, nil -} - -func (m *SafetensorFormat) readTensors(fn string, offset uint64, params *Params) ([]llm.Tensor, uint64, error) { - f, err := os.Open(fn) - if err != nil { - return nil, 0, err - } - defer f.Close() - - var n int64 - if err := binary.Read(f, binary.LittleEndian, &n); err != nil { - return nil, 0, err - } - - b := bytes.NewBuffer(make([]byte, 0, n)) - if _, err = io.CopyN(b, f, n); err != nil { - return nil, 0, err - } - - var headers map[string]safetensorMetadata - if err := json.NewDecoder(b).Decode(&headers); err != nil { - return nil, 0, err - } - - var keys []string - for key := range headers { - if !strings.HasSuffix(key, "self_attn.rotary_embd.inv_freq") { - keys = append(keys, key) - } - } - - slices.Sort(keys) - - var tensors []llm.Tensor - for _, key := range keys { - value := headers[key] - - var kind uint32 - switch len(value.Shape) { - case 0: - // valuedata - continue - case 2: - kind = 1 - } - - name, err := m.GetLayerName(key) - if err != nil { - return nil, 0, err - } - - shape := make([]uint64, len(value.Shape)) - copy(shape, value.Shape) - - pad := func(s int64) int64 { - return 8 + n + s - } - - t := llm.Tensor{ - Name: name, - Kind: kind, - Offset: offset, - Shape: shape, - } - - t.WriterTo = safetensorWriterTo{ - t: &t, - params: params, - bo: params.ByteOrder, - filename: fn, - dtype: value.Type, - offset: pad(value.Offsets[0]), - size: pad(value.Offsets[1]) - pad(value.Offsets[0]), - } - - offset += t.Size() - tensors = append(tensors, t) - } - - return tensors, offset, nil -} - -func (m *SafetensorFormat) GetParams(dirpath string) (*Params, error) { - f, err := os.Open(filepath.Join(dirpath, "config.json")) - if err != nil { - return nil, err - } - defer f.Close() - - var params Params - - if err := 
json.NewDecoder(f).Decode(¶ms); err != nil { - return nil, err - } - - params.ByteOrder = binary.LittleEndian - return ¶ms, nil -} - -func (m *SafetensorFormat) GetLayerName(n string) (string, error) { - directMap := map[string]string{ - "model.embed_tokens.weight": "token_embd.weight", - "lm_head.weight": "output.weight", - "model.norm.weight": "output_norm.weight", - } - - tMap := map[string]string{ - "model.layers.(\\d+).input_layernorm.weight": "blk.$1.attn_norm.weight", - "model.layers.(\\d+).mlp.down_proj.weight": "blk.$1.ffn_down.weight", - "model.layers.(\\d+).mlp.gate_proj.weight": "blk.$1.ffn_gate.weight", - "model.layers.(\\d+).mlp.up_proj.weight": "blk.$1.ffn_up.weight", - "model.layers.(\\d+).post_attention_layernorm.weight": "blk.$1.ffn_norm.weight", - "model.layers.(\\d+).self_attn.k_proj.weight": "blk.$1.attn_k.weight", - "model.layers.(\\d+).self_attn.o_proj.weight": "blk.$1.attn_output.weight", - "model.layers.(\\d+).self_attn.q_proj.weight": "blk.$1.attn_q.weight", - "model.layers.(\\d+).self_attn.v_proj.weight": "blk.$1.attn_v.weight", - "model.layers.(\\d+).block_sparse_moe.gate.weight": "blk.$1.ffn_gate_inp.weight", - "model.layers.(\\d+).block_sparse_moe.experts.(\\d+).w1.weight": "blk.$1.ffn_gate.$2.weight", - "model.layers.(\\d+).block_sparse_moe.experts.(\\d+).w2.weight": "blk.$1.ffn_down.$2.weight", - "model.layers.(\\d+).block_sparse_moe.experts.(\\d+).w3.weight": "blk.$1.ffn_up.$2.weight", - } - - v, ok := directMap[n] - if ok { - return v, nil - } - - // quick hack to rename the layers to gguf format - for k, v := range tMap { - re := regexp.MustCompile(k) - newName := re.ReplaceAllString(n, v) - if newName != n { - return newName, nil - } - } - - return "", fmt.Errorf("couldn't find a layer name for '%s'", n) -} - -func (r safetensorWriterTo) WriteTo(w io.Writer) (n int64, err error) { - f, err := os.Open(r.filename) - if err != nil { - return 0, err - } - defer f.Close() - - if _, err = f.Seek(r.offset, io.SeekStart); err != nil { - return 0, err - } - - var f32s []float32 - switch r.dtype { - case "F32": - f32s = make([]float32, r.size/4) - if err = binary.Read(f, r.bo, f32s); err != nil { - return 0, err - } - case "F16": - u16s := make([]uint16, r.size/2) - if err = binary.Read(f, r.bo, u16s); err != nil { - return 0, err - } - - for _, b := range u16s { - f32s = append(f32s, float16.Frombits(b).Float32()) - } - - case "BF16": - u8s := make([]uint8, r.size) - if err = binary.Read(f, r.bo, u8s); err != nil { - return 0, err - } - - f32s = bfloat16.DecodeFloat32(u8s) - default: - return 0, fmt.Errorf("unknown data type: %s", r.dtype) - } - - if r.repacker != nil { - f32s, err = r.repacker(r.t.Name, f32s, r.t.Shape) - if err != nil { - return 0, err - } - } - - switch r.t.Kind { - case 0: - return 0, binary.Write(w, r.bo, f32s) - case 1: - f16s := make([]uint16, len(f32s)) - for i := range f32s { - f16s[i] = float16.Fromfloat32(f32s[i]).Bits() - } - - return 0, binary.Write(w, r.bo, f16s) - default: - return 0, fmt.Errorf("unknown storage type: %d", r.t.Kind) - } -} - -func (m *SafetensorFormat) GetModelArch(name, dirPath string, params *Params) (ModelArch, error) { - switch len(params.Architectures) { - case 0: - return nil, fmt.Errorf("No architecture specified to convert") - case 1: - switch params.Architectures[0] { - case "LlamaForCausalLM": - return &LlamaModel{ - ModelData{ - Name: name, - Path: dirPath, - Params: params, - Format: m, - }, - }, nil - case "MistralForCausalLM": - return &MistralModel{ - ModelData{ - Name: name, - Path: dirPath, - 
Params: params, - Format: m, - }, - }, nil - case "MixtralForCausalLM": - return &MixtralModel{ - ModelData{ - Name: name, - Path: dirPath, - Params: params, - Format: m, - }, - }, nil - case "GemmaForCausalLM": - return &GemmaModel{ - ModelData{ - Name: name, - Path: dirPath, - Params: params, - Format: m, - }, - }, nil - default: - return nil, fmt.Errorf("Models based on '%s' are not yet supported", params.Architectures[0]) - } - } - - return nil, fmt.Errorf("Unknown error") -} diff --git a/convert/testdata/Mistral-7B-Instruct-v0.2.json b/convert/testdata/Mistral-7B-Instruct-v0.2.json index 1da4d2ad..88d447b3 100644 --- a/convert/testdata/Mistral-7B-Instruct-v0.2.json +++ b/convert/testdata/Mistral-7B-Instruct-v0.2.json @@ -4,7 +4,7 @@ "general.quantization_version": "2", "llama.block_count": "32", "llama.context_length": "32768", - "llama.embedding_length": "", + "llama.embedding_length": "4096", "llama.feed_forward_length": "14336", "llama.attention.head_count": "32", "llama.attention.head_count_kv": "8", diff --git a/convert/testdata/Mixtral-8x7B-Instruct-v0.1.json b/convert/testdata/Mixtral-8x7B-Instruct-v0.1.json index 0967ef42..a1596532 100644 --- a/convert/testdata/Mixtral-8x7B-Instruct-v0.1.json +++ b/convert/testdata/Mixtral-8x7B-Instruct-v0.1.json @@ -1 +1,348 @@ -{} +{ + "general.architecture": "llama", + "general.file_type": "1", + "general.quantization_version": "2", + "llama.block_count": "32", + "llama.context_length": "32768", + "llama.embedding_length": "4096", + "llama.feed_forward_length": "14336", + "llama.rope.dimension_count": "128", + "llama.rope.freq_base": "1e+06", + "llama.attention.head_count": "32", + "llama.attention.head_count_kv": "8", + "llama.attention.layer_norm_rms_epsilon": "1e-05", + "llama.expert_count": "8", + "llama.expert_used_count": "2", + "tokenizer.ggml.model": "llama", + "tokenizer.ggml.add_bos_token": "true", + "tokenizer.ggml.add_eos_token": "false", + "tokenizer.ggml.bos_token_id": "1", + "tokenizer.ggml.eos_token_id": "2", + "tokenizer.ggml.unknown_token_id": "0", + "tokenizer.ggml.scores": "e3d3eea80bb41a1213f2d0aa3e8a38581d1f19323be77dbd779c9c7e3b72e676", + "tokenizer.ggml.token_type": "6040635e6bd38d98af06698feb75c1802bad35180ee6ae0a503e38c0f60fd71e", + "tokenizer.ggml.tokens": "604ac4bfbd019e430d7b6cdf18c6c0cd5b967900601f0307f714ec7773aa5ca6", + "token_embd.weight": "1d1d1d39a867d5a4bfb32792a47247d2638c10c95a6259391d02843583505cc4", + "blk.0.ffn_gate_exps.weight": "2e5cd43ac3f26c44f071926ff6c3f239ecc52a34bc9a5b5906d3d4c1bf2fbbfa", + "blk.0.ffn_down_exps.weight": "a4dfc7e7c96e7402eb70279601675b956bb7331da8101e63fe5c0a611b6972e5", + "blk.0.ffn_up_exps.weight": "2d5d87b378b2319c344ed2c642598b6f7cb6beeb582a8ea51abc9ae690d473c3", + "blk.0.ffn_gate_inp.weight": "a46aaf5aba7401ce6e41f158242b4879d34901661f3ede85496cbd0ce79d6314", + "blk.0.attn_norm.weight": "3fe37d913bdd2b65076bcdd6efe64a37b0b03cacbb1b80b9f7089068aa35f38c", + "blk.0.ffn_norm.weight": "5e14308a3c894734eb204c8f558bdc817e94bbd5b4e9cb4094e91ba388c8f7f2", + "blk.0.attn_k.weight": "73d943dcac0911e87bd771f4aa1c901e1bfe1aed293af06e1a67812159859f67", + "blk.0.attn_output.weight": "4c5f754c855e262e8d4c94c6fbbb57af06399dc0e170d7d99a1a17fc9aab9227", + "blk.0.attn_q.weight": "d6fd7403c873d49c05f6f03208f30d99ad34cb3b71c9990c47334d502a8e4c7b", + "blk.0.attn_v.weight": "cf17cf64b2d683bd9de6cebaf60e5c264df6fdc38fe719dde9d54c80334f6366", + "blk.1.ffn_gate_inp.weight": "0d524de81cd915816b4e714bf595ad6946a9130b3de731cd89428b2781230809", + "blk.1.attn_k.weight": 
"2ea47f412992b374c70674730fe84700e0c8cce177086ce9b6635e42408964bd", + "blk.1.attn_output.weight": "b4b2520794d54113e86c8ff678eacfc62e35be4395a594a6c8c22b4383ebcc0c", + "blk.1.attn_q.weight": "5db930c98c4f91f6eab57eb974c72210b158e366d23d6d2890b2759c053bee33", + "blk.1.attn_v.weight": "079bdde09668394bf7af9f8bc175017b4f48f0ab64e6dd855a4d7561d1693c0f", + "blk.1.ffn_gate_exps.weight": "146a62de19f9ab093deb101f9640534ffc3dc40d69f508be12fc0475d01b0c7a", + "blk.1.ffn_down_exps.weight": "949da94a3c0f375160672a979e85f7def284264b10d48d038238aad5f5ece793", + "blk.1.ffn_up_exps.weight": "7016a3f467d9e3f2f4b4019579ed86b757469cd367f2b225483305376b4bb3c1", + "blk.1.attn_norm.weight": "1614d1e6ed537737275eb888666c7bac533f4eefbe73dec92b591045ca9e1afd", + "blk.1.ffn_norm.weight": "405a455fa7d1ec36894652ceb554bbcb09a07fd6405f42741e66dc4a4665c19c", + "blk.2.ffn_gate_exps.weight": "90d5003fc7421f44220c0842d43128955e91488f6f785fe570b62d81b719e964", + "blk.2.ffn_down_exps.weight": "ecdc2b5a8b504ef0a7833acff47d69b0c1fa9c22126de1bb120ff5e48c3d6e2c", + "blk.2.ffn_up_exps.weight": "2cbd9485a32460d315eb50a2f3b00863fd77245bfe885b7565efac1cdb1f191e", + "blk.2.ffn_gate_inp.weight": "0d0a17a1a2c7a61f2cca49ecbb479154dc93a870873257bc4f225e7607f2e2c2", + "blk.2.attn_norm.weight": "b2e4c5a977f87a6f880896bd73596234c9b83622fa0d7add5892501e3155913c", + "blk.2.ffn_norm.weight": "0ab875b4280afa922376cfc7b9aa3f7071c9432ea1254091ce7de3749df0e8e6", + "blk.2.attn_k.weight": "bb884af51fb51550acfef54ccf1b58ce8284e587806e6a2f88c8265e1ad05a5e", + "blk.2.attn_output.weight": "0f03099ba1ef342ea61af9cd71d028123bbd8b1dd7d7fd9b509aef77815427d9", + "blk.2.attn_q.weight": "8fad0d29eb4c9d24e564774ee3316b9eb7a4c4985e4567111d2c836c830f6cf3", + "blk.2.attn_v.weight": "fe04c847ff677632401a94e7b6b6fdca60391ab21cb23bd791533115de6303a1", + "blk.3.ffn_gate_inp.weight": "29e3aaa724590c070e614af8288939603d2641b0ef11e8c0f476bebb2776673c", + "blk.3.attn_k.weight": "231cc5631def10f7f292d8862d6125ff555164cd70480ac76362149fad204497", + "blk.3.attn_output.weight": "86467a605c62852e05fda1a7ef43150df2cf715fe59785dbcba09f1c27cfa086", + "blk.3.attn_q.weight": "901822402453922225c2d6ac79616691d48217635d5ff7338daa971d5ddee210", + "blk.3.attn_v.weight": "27030784f44375720df2f090933645a31a022d3fb3b14573e5ca0b78f44070c1", + "blk.3.ffn_gate_exps.weight": "231ba59cc0b988d125d77bf627aa3f04636684870af88f081f3944b48a160d86", + "blk.3.ffn_down_exps.weight": "530c3ab44ae4d66e8afa4d10c153ba5dfcdfb7321989a988e62e9d12e7234625", + "blk.3.ffn_up_exps.weight": "b85c2d4d9d11332e702b3c0a6610d4f525f9a93e5d12f5c7c55c592c40755e75", + "blk.3.attn_norm.weight": "05dbb6d88cfa6b199f9d705ccbda97c0ef13f9ec875c595398a1a42d009a4555", + "blk.3.ffn_norm.weight": "6880b1c27d46969ce36fac049c05dc8b89e4bb47dc89df357e32df7e18fc512e", + "blk.4.ffn_gate_exps.weight": "a883b4f225b760c5a2f6605dc5e2167ab85bb398c70bf64ceb539fcbd6128dcd", + "blk.4.ffn_down_exps.weight": "d291bb656aae77947d4b525e2819bf4112afece53ff31de9dab999af1f65f9c4", + "blk.4.ffn_up_exps.weight": "38592afb8ba3dcfb26970f906174f7d3fa62da44fa4be4fc6912a19030ea9164", + "blk.4.ffn_gate_inp.weight": "1596cb74e8fd6c3080b937b06468bb397b0dbb661e6d180a6bcbdc43e8bfd0c6", + "blk.4.attn_norm.weight": "f90c83c5ff4366281d283384efc941620542b9cfdea160d678dc54a75e33f758", + "blk.4.ffn_norm.weight": "d28d8c49d1746b7cc085562d1074905fd14023844de823dc4fb22202bb280790", + "blk.4.attn_k.weight": "792bbf412cc357140fdaba543e547a9b2f7582919e307bbd9a80c7d6d8f5f1f9", + "blk.4.attn_output.weight": "d98e4a062d2631d9c315f1990d5f6ca9a88e7e0e46387f611ccb0353f876aa12", + 
"blk.4.attn_q.weight": "1a11a55a91d9f748a72176ff6b1c174844df406e00d1b66b9aa64dc6ee4bcd1d", + "blk.4.attn_v.weight": "04cb3c02b12a6313c7ac7044513441083d534fb4c5a3f63bbaa58f7edbd2fadb", + "blk.5.ffn_gate_inp.weight": "cbd5cdf015d33a2da6703eb74c22fcb97581fb9175435173b6dc4f9e8364320d", + "blk.5.attn_k.weight": "4fdf3405e4d657403f5647b51233521310ee984b4b81bbcd901cb3e6ab76b7ff", + "blk.5.attn_output.weight": "4a25662c46979a29600ed77e1907cf81fb16ef30e724c155444e54ccb76af481", + "blk.5.attn_q.weight": "e2acb30e30b97300039bb20ad0878f05159d5657fa811748a51d5b6fb35d631e", + "blk.5.attn_v.weight": "306504b6a26aa123c63dbbed3f4ced0ed2ee8fb6a30bf0093539b817539f5ece", + "blk.5.ffn_gate_exps.weight": "7e34df9b9944dbeea5e8565786d3aa6937314a4b87acd4d0874687877c5a39fd", + "blk.5.ffn_down_exps.weight": "c4b7a57a42b5ac0a8ae27dcd5cb2646d7a7cc7123126d44a56ab128e85f60b13", + "blk.5.ffn_up_exps.weight": "09d47593b6dd6c664a9155bff02fc2eb7ac4a70219a88162d05c802a01d3c6ba", + "blk.5.attn_norm.weight": "58804a036d6ac4c1fe357b8b6a97a5c37cae1c2f06ee0086c041d449c1c6ef6a", + "blk.5.ffn_norm.weight": "d872dee6789f0826211aa46ca9d0869e3e96bcace9e77d6559a7b6f3e524f3ca", + "blk.6.ffn_gate_inp.weight": "fb1eae732e974d6c1d020a5b4ef98c5f33016f984701bcea656f999a99daad66", + "blk.6.attn_k.weight": "55e9c59c5051ab5519b3a7962e1b5fa96a3c0251cb6200dc2f177885ad2de470", + "blk.6.attn_output.weight": "f3c834a8d0027370350e2b6294d95434d31432e57be6313b013c15a56303d61c", + "blk.6.attn_q.weight": "efaefe5f11c2140dc7cb532b0832c2a0b363a165cbda21f00fadae77efca377b", + "blk.6.attn_v.weight": "900bd734d75616d846a90a121c97e081c956a3d1ab012f66dd0bc62c43e1ec3c", + "blk.6.ffn_gate_exps.weight": "312a99661b1468fcaed2474621116f1681432755e973f3ee79d01912974fd424", + "blk.6.ffn_down_exps.weight": "ac9cd7db67a2ef0d2b5def86873673d05e48d49d147dd944469dbb8e2d4c46f6", + "blk.6.ffn_up_exps.weight": "57613e7e09579400a1a09fee4445acfbfe83f2f327fdf317877787d96ada6b84", + "blk.6.attn_norm.weight": "0e8801e09885c633bc01a9a5b85d4e878d30158a4eb41a937dc5b760ebd044cb", + "blk.6.ffn_norm.weight": "b8c58062ac93072f878446b0e7f958c737aa47fb769fc3a8f593133d12db2dd1", + "blk.7.ffn_gate_exps.weight": "1ef611732ff13edfa8d30981ed9dac00c15ceba9fc012ed0b199e9280a849948", + "blk.7.ffn_down_exps.weight": "856c6811945c7b0fa461ca17811cfa43436b4cdf5326bad23cbc30883486d7cc", + "blk.7.ffn_up_exps.weight": "6725e3e33994302ee13fa5ec163631ce2dcaa08aadde8fc166c2265d4561c5c5", + "blk.7.ffn_gate_inp.weight": "36b49d7f80c1003dc392b2c1b9960cd49889dd69e77b26b9e4b13d01f3d0a32a", + "blk.7.attn_norm.weight": "7a0ec49acc5e20ee71c6f80ca02f4f1e564c485e0ae0621309e7c2eb0c616cf0", + "blk.7.ffn_norm.weight": "eeae035c39ab6e64bc06a4baa1bf6e50d4c8b8797cb0ad8abd48be86974802c0", + "blk.7.attn_k.weight": "e8f78c1def01a7a38d2d9bf7becb17755e28fefe4927856f7890fbee52840187", + "blk.7.attn_output.weight": "5367f05ac3bb49ef8745ba5902e1bdd4442415a3ebff2c7e1a3918d7be6fe948", + "blk.7.attn_q.weight": "37c95fc5acc55a4f6e5f02cab9be60e4fe54c08b65f98f4455741b4aa542ff4e", + "blk.7.attn_v.weight": "c89f1343486ba55814233511e94090f7365662a8a4214aa4c278cdadc79196c2", + "blk.8.ffn_gate_inp.weight": "4e239afe8c7afb8de3a005757c887cf14b1622ca2d224227591cb0e5301f4c17", + "blk.8.attn_k.weight": "2ad0229f30fdcc1e85ce64e00d8f75902238294844a81d5af43e14ba75c02983", + "blk.8.attn_output.weight": "2e44a4722acb3b521b81d0b910f8ca2f6c286d874a92ddd02150566454061699", + "blk.8.attn_q.weight": "1cd2b09cb2f43e08de776b5f7eac197a5a6d4ffdfd52b21baa36319450147bd0", + "blk.8.attn_v.weight": "5a22c57ebfd33ac500cbcfd321d5b5b1783f8728801db6f3f8bed51c7183e4db", 
+ "blk.8.ffn_gate_exps.weight": "91063fe56cb4f3ff3b41052bb5046fcf8ef61516a603ee90aab893a9d68c15a7", + "blk.8.ffn_down_exps.weight": "d4c3abc8f1d1b462f67f70bd8f404b3fcf45dceeaa8527fa120527254c383c90", + "blk.8.ffn_up_exps.weight": "76a1a1f08ec577716a2e7027b45293e9205751126424f1bebe1de89c78f087d5", + "blk.8.attn_norm.weight": "f980d774da39eb76c52358afac3e38cb4c81cb323deaabbe5c41822e3f17a98e", + "blk.8.ffn_norm.weight": "1c937658cf90f1a85db9a5f26e077730fdd4b694607dbeeb825c5fb2bc407e0b", + "blk.9.ffn_gate_exps.weight": "a2532471ecb7896d5c78e5a34e10cfaf4125265e1595166c8d0d0dfbe2a3187f", + "blk.9.ffn_down_exps.weight": "b47921a28412d48fee450b8b9d97cee42344a2e69f06d407fd9523d7adf13333", + "blk.9.ffn_up_exps.weight": "7c461bd1b2a73b439cff6a10d94afa01e8b06f7e6f09d9a6f28e3876aef48bce", + "blk.9.ffn_gate_inp.weight": "1648dfb08b5c06d7953a5a97ecb764995fae9487fb729a1c867023b2538149d0", + "blk.9.attn_norm.weight": "8635db0f299882a63b7cfcd1d4259c9e53fab22c31d3d054de36b1001380b31b", + "blk.9.ffn_norm.weight": "f9309aa323062d174c463613afef9b0a33501b510bfaa58a8e0e866d12ffef3c", + "blk.9.attn_k.weight": "dfe62030441e947a588512d18d9c6e4ed72c2f71c227d622c095e4263b23dadf", + "blk.9.attn_output.weight": "1977beb75c6349c50ba7dd3865d7c0a9c5c5ddc854413147b0eec98ac4fda351", + "blk.9.attn_q.weight": "eb132596719605cd6bd1782487f121994629e115190edd69240b12af66e734f5", + "blk.9.attn_v.weight": "9e708f15d332d7c5187b0693b1a977eb30a2fa10bf7df48ed9d7537c0aa6ed99", + "blk.10.ffn_gate_inp.weight": "97503a5d166c1925f9b65c0eed980753d411714d66896f3d0fad5286c7aba702", + "blk.10.attn_k.weight": "1ebdd222336bd25b48df1b138cdbe09021c4a5562ea7cb78cadd1255d2be3a39", + "blk.10.attn_output.weight": "5e98faa38e9d514b9057e1c8342c509cbe1083defd518e506f6bad89117d1f5a", + "blk.10.attn_q.weight": "3323a26c87d936d1dd87c577d0b763459fced726679612c874b3de5fc6d969c5", + "blk.10.attn_v.weight": "d5fa73cb56aca388e205f44455e4b4f676fdc12ed7fac4542fbb3b41ecea59ad", + "blk.10.ffn_gate_exps.weight": "225021b53782800906cd13b70be3a4161e8b300b97f984a959ccad6a6e8adcbd", + "blk.10.ffn_down_exps.weight": "f08eb91526bd22f5fd0402fe925d6141cdbb308a1ced0330858d0c85c71f5ef3", + "blk.10.ffn_up_exps.weight": "a9f688350c3b53eaada5103b5848bd9a3d7d6b327a70fa16c24bf28ece933eac", + "blk.10.attn_norm.weight": "5ba426c9dfc79805015ccd76cd1068b0ad3bb7a8453e14bb1d35486f122d8f95", + "blk.10.ffn_norm.weight": "98891d6acbc3986b2581b7a3af9f5946a392d9188972c6a8b15d4e745a4f2482", + "blk.11.ffn_gate_inp.weight": "b2365a60566e7dace892e1cb0e62eb73ce387352601723e847052b34874feaa6", + "blk.11.attn_k.weight": "0efbc1d1430505543ff71532a4fcda821aeac616ef6c1dca40e00d4f2ff70bea", + "blk.11.attn_output.weight": "3d5bd4d9a41236f30d4293edb9ae27beaa113ffb31b4fbfadff3a4c370dfd3e6", + "blk.11.attn_q.weight": "aa11e9db14dd9c77951511443077c2a1a78070753d7bd3d9811038473f69e325", + "blk.11.attn_v.weight": "5adc567f377aa11d1763d35f50e53fb2896a8b03b623ac36acc45efa2486d512", + "blk.11.ffn_gate_exps.weight": "71d07d982aabfab9eed3c733d49c20f023bf475368fc71db5084d91beadc4b47", + "blk.11.ffn_down_exps.weight": "9a06e61461e48b3925a9f7d9cca634d048c8b62163d7bc5c43e35899f959319e", + "blk.11.ffn_up_exps.weight": "bc05494d0dcec61021b3ac0c5bc1bf502736cadf48224e213bc139d562699a89", + "blk.11.attn_norm.weight": "a5758a10bdd0404ae1470e8e9db903985d4d07f60553c5001a5e7b660d4f7ada", + "blk.11.ffn_norm.weight": "814ae037563aad3771787316bec4806c95bf6f5991dd6474b4b1e5cc13dc18ee", + "blk.12.ffn_gate_exps.weight": "3a68b831ba1606fb9ef6dffed4732032447ecef23ea563ff4e79317586c7eb49", + "blk.12.ffn_down_exps.weight": 
"268b25e13f4b7beab08686e83705a41b21d15251809ee4784526f78a580da829", + "blk.12.ffn_up_exps.weight": "9105751a5b5b42ca2614d0456f24f779d2e2ac8cdff0f96842aa7ae2b70f341e", + "blk.12.ffn_gate_inp.weight": "d0de1558cc1d458c5c504f63ddc59785c323df7330474bb0644c346104b40a3a", + "blk.12.attn_norm.weight": "859a4c8113678e2e202d10299850e0cfb52eb11ea50bcbf4fe3ff39bdd394154", + "blk.12.ffn_norm.weight": "7fbf4c459c1760218877e9ee3f5ad49e960956a4369bcfe96c143f04ff9ddf97", + "blk.12.attn_k.weight": "0a7e254fdf3730a57372b6ff421a613eabaea68cdefd64800857941411318374", + "blk.12.attn_output.weight": "ceb763fc15d88af149d8fb78e82db2b7dab3aeae584af8cf7611a12356a397e5", + "blk.12.attn_q.weight": "a43402d23c46cb2d3cb3c2a98c81b19d10026b7e6742370fed6b2880b6e049b5", + "blk.12.attn_v.weight": "3bc24f2c0480ce91ef72993ee8f1cf962f7359e12183424583ffa1246bf3db52", + "blk.13.ffn_gate_inp.weight": "a6d68c82bfe66d8bab68f980f5f18268a9e2c0cd6b8832ed39010e0de198ae05", + "blk.13.attn_k.weight": "0166c39546b37dc2e01b2b396ba43e183f797dd04eaa51a6d103d8b58ee4bace", + "blk.13.attn_output.weight": "2ce5eb198deab9557475a58b69b11e9874b547e05c23f223c6e42fa35ddca069", + "blk.13.attn_q.weight": "745c1bbdf434284a7fae98f45e821c076dd9c2a2467dba6a9d8cf0041e419dbc", + "blk.13.attn_v.weight": "9ece68d5ac64d1421ea7aa32e1cff9cc1fecf5175f4c4da858dd31d8633e3337", + "blk.13.ffn_gate_exps.weight": "ccfdcb4670b131689de12d396a010b5ea737795cf5c15a14a304d720b3c7c899", + "blk.13.ffn_down_exps.weight": "8b8fb328664764f1aaa5cbdec336d5654e981e965a02ef622bde5f07ea1c164d", + "blk.13.ffn_up_exps.weight": "d2ace0236c2fb3365fdc85499d676a7f65813c48e5085348b1df1799922766ec", + "blk.13.attn_norm.weight": "1ed29d7d89ce52d7cb4d57e895ff7115430466e917136c049c385c030ed44e9c", + "blk.13.ffn_norm.weight": "a194fc542597a4dcfdfaec5e3cba2a2b2b21b21edfc87c39c0d7f7651355bc4d", + "blk.14.ffn_gate_exps.weight": "a625e3574e5e740e7f8e2f9c40390f2f382c720aab5b10534e298002dd8d1fb9", + "blk.14.ffn_down_exps.weight": "bc366f015b83c865946afd74c8a884943e0ea2c671314a0b7bb72f21a44d2f78", + "blk.14.ffn_up_exps.weight": "ee3199bf2086de77b49f57f487676be8ee70e102a2fb5a5ef8ddbbc28a9eff41", + "blk.14.ffn_gate_inp.weight": "2b437870c850fa2e2044d032bb02908af634356e37466fdae260b933e48ee8b4", + "blk.14.attn_norm.weight": "cd8344d193a1cbd42bd898e17f4bcb1ca0b2918420fbdafa9249a6f2b7f4ae06", + "blk.14.ffn_norm.weight": "70eec40374e558fed5b07257283cf36342b6b0129285a00007deb59c32c9f7c8", + "blk.14.attn_k.weight": "4053bdb507e0543d724b632570bac86b31707696d90a0db44c49b2a082e0d599", + "blk.14.attn_output.weight": "0182632cb0e06a07241b8293d25d109fbc1862e1e337d435f908e8681e2eb1ab", + "blk.14.attn_q.weight": "ffc7794a4c1b6f793c842dba969435330a7a80b9212e457b4b2ac33e68b41241", + "blk.14.attn_v.weight": "6411805292d528e61bbaad8f9aab9dd073529a17946c057fb06864fad9cf3211", + "blk.15.ffn_gate_inp.weight": "77d0744567c76e6abb67f81ba9c715b2b544841186d5b948309571eff213bafb", + "blk.15.attn_k.weight": "1f7957954ea4c6521c257b35a360e868ffa02bdb3de91f146d5e06bb4a545c98", + "blk.15.attn_output.weight": "d7809d36bd8d3342240c46fd87bcc7f9821a222f48d9a95e45ae50460265d3cf", + "blk.15.attn_q.weight": "25f509313ae4d8401b871904059f472a26f5714e7c791c725de77a1a522c976e", + "blk.15.attn_v.weight": "96fedf5a591fc0f020e6de10fd72ff12b3ef9cf70cd21dabaa0d3e7b06f54e73", + "blk.15.ffn_gate_exps.weight": "8f950d976b2fd9a3d213b84123cf114c1377efde9352767fb2ddee89e177c8ef", + "blk.15.ffn_down_exps.weight": "6fd09d1557bb94b06efbd4f6a1ca4be532a202ba290e9315bc8da3d12a5c4c4a", + "blk.15.ffn_up_exps.weight": 
"cbeb59ae7b0266a928dc7e3a6e70a9330b92f9ee1b17ee1ed91022108204a33c", + "blk.15.attn_norm.weight": "2005330911ac2edc7b6d27aca021c67d30d16eb632e49b1a13f30fdb2717aed0", + "blk.15.ffn_norm.weight": "0e9198f3b548eb78acc8961f2b3350d238d26cec110933ba753a8cf0035c501c", + "blk.16.ffn_gate_inp.weight": "a41d1f99d739c8b150c3945b6949763988d0c6a4c5a2b5855592ca1a48ed23d5", + "blk.16.attn_k.weight": "b624e2ec88c2d3047f60530fb87e72cb4a5e655a9663f6f3e9b09e5ad32cddaa", + "blk.16.attn_output.weight": "687759ea75e45108526ffc1573d6fdf084728079bfc2dc89b9979e76280f43c4", + "blk.16.attn_q.weight": "beff3a45c7e9ec82ffc6d3c701126be28654d10aabd747d03441210491fd31b6", + "blk.16.attn_v.weight": "43a349b13f0b9d040cacecd942bcb168c030fef8c75c987d59a4fce6c14e855b", + "blk.16.ffn_gate_exps.weight": "793406d6c13d727c82bb7b692ca98d65ca975baee69fc57be5378d77c5a19b62", + "blk.16.ffn_down_exps.weight": "9bad3dd150d0230404b7f886ac7ff8803225757e813f195cdb26bad245243b4d", + "blk.16.ffn_up_exps.weight": "7449d663023fea3496475bf0a9c1de7272ad0ce9adcb3265e8e424badaa674dc", + "blk.16.attn_norm.weight": "a424ce34c195a401df1ce37ac4f2794e8a6720b1ee8acb21428e2b68c65e0125", + "blk.16.ffn_norm.weight": "405a68bb8e16e1064df2de55ca3cd9ceddda1d9fc0af007a9bd7cad4b2676248", + "blk.17.ffn_gate_exps.weight": "97c6e5321491ca5dc039ee88da0eb0e78f347372785411809af84b3298cb19dd", + "blk.17.ffn_down_exps.weight": "1617ac19788a1be19bac69277408761e6bdf5719d63a8c7fea14d41cc27641b5", + "blk.17.ffn_up_exps.weight": "4ead1c365f112581c10610ea3f63d2a1474311d2503d2060fed4b458ef337f5d", + "blk.17.ffn_gate_inp.weight": "ed4b3393f2523f2b5e0fc7680a1caa2842e605728a529b5af68a7fa8d7abf940", + "blk.17.attn_norm.weight": "beac17ef86a7fb2b5840cc72f7a95a5e3d6bd24e7fa698e0b0ebb9bdac45c561", + "blk.17.ffn_norm.weight": "81cb58ec6d6dc02a0b4ede10adc336dc865fa76f982d4eab0e4a37b40f5b0fac", + "blk.17.attn_k.weight": "eab569e5ea8c8b05e5a6a209fba031129453c2e28181eee3e736b3b04b36bbec", + "blk.17.attn_output.weight": "f85b70f01438ce8fe5d10599b113f30bf18dee2bbae0657d3eba295870001db3", + "blk.17.attn_q.weight": "887ceebfbf6a2b94b43d2df4439ac3a5bbc29311d4b28addc04d525546032047", + "blk.17.attn_v.weight": "2df9414d65014c06a93da22ba3a668be7b83e2e8008e98d7771f7dfebed98298", + "blk.18.ffn_gate_inp.weight": "9b07741a0950fc667e5fd25937e33bc22e1f764f80eb4ff3119f005327ae0f6e", + "blk.18.attn_k.weight": "8649598dbb63938744c39bcda5ce8c31773e29c573be8d4d2c114f5030f8d3e8", + "blk.18.attn_output.weight": "f8e391adb92622298ca834d5d1eda48b69c3b1c51c5a584ef6c54a725c298d75", + "blk.18.attn_q.weight": "84bf8708a2eed618f48f69c178ed7dd11fa4c468102376e72e910ebd037d131f", + "blk.18.attn_v.weight": "31db3cd773f09548c2c1b1eac2718e46364a7810970fe9c433fad9d8de5397eb", + "blk.18.ffn_gate_exps.weight": "be2a2ba378002f1b61f86c273a69eede9b93786d5ce96b4fee1861f730dca4c4", + "blk.18.ffn_down_exps.weight": "d35196159e37705db50a5343e3989f7335477f1a4add67ef42ad64a638cd07ae", + "blk.18.ffn_up_exps.weight": "c6ceedd86e97913a6dcadc838e7abb762d629fb8dd55f15cf02fd9bd66d2ba78", + "blk.18.attn_norm.weight": "41f0b1ad83d6e3cb9fbe0d27878c2e7ad4a351b9f554a6bc9117c01745cdf6e5", + "blk.18.ffn_norm.weight": "96646204bd0d82f25dc77faba4dbd86b1332e449313e6684e00122da8be99057", + "blk.19.ffn_gate_exps.weight": "c6eb7f61e7938bda0492dbc05e51e8f631c99224fe18e99861fc4fc53ba9e9ff", + "blk.19.ffn_down_exps.weight": "4384803da3a3a3d44120d7dd192fe2c9bbd9a1a0cb492dbec1fdd7565230f1e8", + "blk.19.ffn_up_exps.weight": "22d73de2fbb8bb0f1bd2caf17fad8a355c47d914143f7f6e6d0128f66f074a60", + "blk.19.ffn_gate_inp.weight": 
"9a0cc4a2301a5634022fbce41189021bf0d1a961792d2d9330fd35556d18e5bd", + "blk.19.attn_norm.weight": "c5cc56ec5df9a1f7d5ad71fbda49f1433132e58895d45cb44c73420bd61ebd6b", + "blk.19.ffn_norm.weight": "77e17de741742ef2482fc7872fd423c8e3c1454dc4d2be89ee939084b6d78bc0", + "blk.19.attn_k.weight": "a92ea36ce2e3569656306aeefb835ccd5d1b03b33a86e0d3d030644cc923b813", + "blk.19.attn_output.weight": "5e2a912b37855f84ea964907a1a86d609cbdd79efa0c93c3e8e2fc07caf7c226", + "blk.19.attn_q.weight": "4ef3a5913292ac3c1a6fd3e9e53d011021f2b41d0276cf849706d1ca925cf7a7", + "blk.19.attn_v.weight": "42981b75b68ae852cee638b5433605c147da4392aaa6d7a06e756115b0171f39", + "blk.20.ffn_gate_inp.weight": "71381b9879a7c80b9f7b475abc0aa31b8cd71ccc00856ebe89764a2acb9df2dc", + "blk.20.attn_k.weight": "1928b7ebc054eb3967929ed6fb446314d5352f4aaf8b475ce55c6345019f2ea4", + "blk.20.attn_output.weight": "6071ecd9ca91af0d2ba93fef4a1a56f3b243dd70f862a21a2d164d56f386043b", + "blk.20.attn_q.weight": "002e95042a40f36ceed5829e3d0c8072e5f5e4ee86a089e2902b2348fed24dd5", + "blk.20.attn_v.weight": "42f509cdb1c0e298f89f896e349be86952c5168e49b3f83bb17badbcb7596d57", + "blk.20.ffn_gate_exps.weight": "a684a3ffe4b0a57c819a5fa9cb3521de223f392732927271e97ce925b6e33765", + "blk.20.ffn_down_exps.weight": "e3081a7bc7ba750d8a4886bc8ca4f231b55db4ca082b54b4106c7531964725cb", + "blk.20.ffn_up_exps.weight": "fad0fd5eca36ab154788da28be8ec25bb5d6db06c9d133db89e96df358a2f6a2", + "blk.20.attn_norm.weight": "c3e3f2429715ae95e884ef1246b0b461b23c5cc0ed08beecf70a14cddd184820", + "blk.20.ffn_norm.weight": "ff31f609dda65ca496b0584fabea6550e42edd05ebf229812aa6b7bb5ede15e6", + "blk.21.ffn_gate_exps.weight": "366f09ef0ecfb86808eb3296cc9abdb957951d27f6533c03f1422b54061da660", + "blk.21.ffn_down_exps.weight": "3fc495947d27fcca7fc0893c8a96e5d48ba27b2c8c58f8fcfb8dcfcd5539741c", + "blk.21.ffn_up_exps.weight": "6713ed51410bcc8283cbb001c4ad784098f25701e8021f4fa4f411e186859c4a", + "blk.21.ffn_gate_inp.weight": "6d4c92c01ec801647134d907bf1108878156df266a6107abc10526332b328b93", + "blk.21.attn_norm.weight": "27605719ae2df24f4f2e85a730927cab20367631612cb501631f6bbf38eb1209", + "blk.21.ffn_norm.weight": "ca80ee8177db185b15a4a378c1cb6f7143c76546a7f1726bda23f329323d4ffa", + "blk.21.attn_k.weight": "9e49f743d4a5bda9b4bd9c40c2ca37cdae5aec7e54cb193897ac8b4945ada14d", + "blk.21.attn_output.weight": "ab923540879753feaed152f5950f69cdd83d8f2413ca873f5f038b63ab0aea12", + "blk.21.attn_q.weight": "62617fc3f1c9d2aa672a4d91a121c7a91b92d145b65e75f0b06b4bb7c825dc36", + "blk.21.attn_v.weight": "15f8b2e72f8e8e992f2f6b3e93238a9d7be7bd6136f91c9d04b4b4cd0cd60369", + "blk.22.ffn_gate_inp.weight": "3ddb1773d9257b68add7a2a4e94dad25ed926803e02707863dd742ab9b2dc179", + "blk.22.attn_k.weight": "680e45a9e8d5feddee5266e119dc053bf80718fa9af1cf6803e6f493b265f1eb", + "blk.22.attn_output.weight": "0d5fae3402fb2c5aa3a860010e3973fc8e3168d1015f7a76b7b2964681693206", + "blk.22.attn_q.weight": "eee7e3d426ab533bd18d62c9aa142eedbde394bed07db58313e0fccc82a23237", + "blk.22.attn_v.weight": "26b5be1fe3c2b6824c5a648a3e4bdf17691904526fca158fbc3ebb627b67e2f4", + "blk.22.ffn_gate_exps.weight": "32ab7a7735313d60f6a75229b1aeee940b6aee176c9648536bf5921b0dc2929a", + "blk.22.ffn_down_exps.weight": "67590808f6a67777d3eb7976c31fe616d388b98fecbb12253b72d1241d70753f", + "blk.22.ffn_up_exps.weight": "fc245c0183e6d90829ff5e71a4ec93e4860b3d4c1a17b9dda2fb64f5f5c9ed32", + "blk.22.attn_norm.weight": "128e99d206d4d6724758ec97468af767fa0aea592149c324b731659c1e74a1a8", + "blk.22.ffn_norm.weight": 
"e45f498033f0cffa15da0eff2c47b4472e43fcf8921729fc4eeb2e3a6b3c78e2", + "blk.23.ffn_gate_inp.weight": "d63e686f5325fbc89fa242c2c52a3b8ff54f867dca914c9ae6eea13e9d6f46e5", + "blk.23.attn_k.weight": "f71f5a577f46ea12b1818f3a5ff4b85ddc45f9a2afb0fa2e041d71a3e31c6779", + "blk.23.attn_output.weight": "92b13563c1e0eac0d748fb67b235dfd7a64c8f16e2dafb316885744582e23b4b", + "blk.23.attn_q.weight": "2f9b9c35dc4f912f3f51c06e2d68f417b51a0de0a84aac530a64f9d3d7b0a2dd", + "blk.23.attn_v.weight": "268e40813806e74a5c364b19556d087bf8374e76e7b6fcf55c381eb7da13ccd1", + "blk.23.ffn_gate_exps.weight": "12f857e7a7ce228afac34d99b602c8d6fe96984f2a21118f459a58cb767ee65e", + "blk.23.ffn_down_exps.weight": "cdb082c16599c3bb36a28066dcc122d9529b54fa91b6cf0153437ec960a5e16d", + "blk.23.ffn_up_exps.weight": "f4b99f6f44d7b8b5a305894e88633bf5938fc1f6303a2b2092399da9c8b64d7c", + "blk.23.attn_norm.weight": "a691392210383915916b4d3886d5e4d56e7855e27e37e414fbd73bf66b3712e6", + "blk.23.ffn_norm.weight": "0c3dc72f667e5ae19b69bfa9f2bd2a01a57681f89ef9527bad4eb0d8c7b70da8", + "blk.24.ffn_gate_exps.weight": "86baca2a3157994df7fd8ced5e08436d5c1810dc29c0715637c36de723e0e7d1", + "blk.24.ffn_down_exps.weight": "ac5d559562b35c34993e34b071f66d15c65be5907797078c2d2a49aba54e3192", + "blk.24.ffn_up_exps.weight": "fce0a099cf09777f44fbab3606ceb75f7fae6f0b80725f9e871654b8cdf9262a", + "blk.24.ffn_gate_inp.weight": "e7c6800c0cfc56b565b2d35ad6f1dbfdb70dd0b05b338bc8da2286ffc3678d79", + "blk.24.attn_norm.weight": "dc6cc18ec52d102d015153c4a1132f9d7a504e29cbdec81c5edbf3b9e65815e1", + "blk.24.ffn_norm.weight": "480d5a1397af5e0e657f1e67d20ec0cdef5724e71246a326843321b87ffabd33", + "blk.24.attn_k.weight": "338c0597954a9b95a782545b2fe36469553e73f86ae2d2b5697767b28e1c7daa", + "blk.24.attn_output.weight": "a77d23b79933c67e52f1eef7f83a3dff4f767ce0bbcc39572f8cec4acd457643", + "blk.24.attn_q.weight": "45c9478593002be1998e96e70668aafa2dd3972380fbc1df12fb05c24ba959e0", + "blk.24.attn_v.weight": "515729420885408a6a9614bc27cda393ed907521318d14d21335d39a3eff0b61", + "blk.25.ffn_gate_inp.weight": "aae4ac40e9ab3925241f9d784b54b38851d9bc999a6c3bc03fc3f17c9b28a67c", + "blk.25.attn_k.weight": "4ab4808d02396c35b00b426f536015673b71c17ae6cd55bbc2e6bfe7a4c59d0c", + "blk.25.attn_output.weight": "1990bb982b77e0c947cd1a8ef0b36227ee1259e6dbbc2829e5c136edf88675eb", + "blk.25.attn_q.weight": "a1490f3048e8c0ec8784f8550c43adf5cc8d0f2f90131c934713fe4b1b015bd7", + "blk.25.attn_v.weight": "f15e53c6d45b3b6f58808fa968425d65e0b26b7f9b268127a77abb1227c67431", + "blk.25.ffn_gate_exps.weight": "656662447ff54f56ee80f78a1b9483f7efdc40f7375d0cd8a9c72ccf21f77e7b", + "blk.25.ffn_down_exps.weight": "db06f101bccbaef19cced0f6c185166e18202465f4a42cddfd535fbe5cbabb4a", + "blk.25.ffn_up_exps.weight": "584a7b02456f27fe1d8d3c7ccd21d426b6ea887795a3ed77f704596a1e3841d7", + "blk.25.attn_norm.weight": "8f0f3597982930fd237e9d609776c64f2b909a455b21678f83a7ebd4bbb83e64", + "blk.25.ffn_norm.weight": "3e7079c32582afba0c55e032f254adc18d2997705eec860185e9a6dd3d82f07e", + "blk.26.ffn_gate_exps.weight": "e70341691b583b86489812b29b77aa41eb658b1865733d6118da54c66e3bfcc6", + "blk.26.ffn_down_exps.weight": "5c1b812d11dfb064af816ced5ab6463bf9722eefdfc341b8a93705d5038fd781", + "blk.26.ffn_up_exps.weight": "e18118362ae54ef7432781c83884f9fb230a9d934e342aabeda8822ea5f71fb6", + "blk.26.ffn_gate_inp.weight": "cd1c5f6710166b9567c6b74c97b2348b191c60aa860958c6bc264ab095261dff", + "blk.26.attn_norm.weight": "71d087531af2520bda2e676c489e8529cef5db8aeea1eec0a937a8b4f2fa2e54", + "blk.26.ffn_norm.weight": 
"7f704e936fda28eb5c2cc339f0f6a5f78170b5aa43c01265b21668870d819c82", + "blk.26.attn_k.weight": "1cc62a0ce0ae251275d898c52c4a9fba5995fca10955d2011d10dd1a59e1afb8", + "blk.26.attn_output.weight": "636e881b1505f9cef656a4be98bec6a4765321d51f9bf1dac8933397cf44b765", + "blk.26.attn_q.weight": "89a3c4d202d7d6adebb9e0c1bcfd8b775f6456386f1be25e86e43acc949c1e16", + "blk.26.attn_v.weight": "ff2cc963b597cdf1a21703f3e7022af3bb4c65a34a19e19d9309a7c5e198b5bd", + "blk.27.ffn_gate_inp.weight": "6150139498fefe380bb99d11e72028da47a15ecb73dfc5b2774f726f4bed8f9e", + "blk.27.attn_k.weight": "f286eb9e5c56c7b801a497aedc40158c2a27877d7f9fb59b3fc67834798902d2", + "blk.27.attn_output.weight": "5dc3d3a05f9f7729509147fd09c16fb53f85f520cdab5cb69abf4bae3fd460c7", + "blk.27.attn_q.weight": "8462e40f86b24251960d6f35a9ea99b8793a01937faf1aec2859f2e5395dbb61", + "blk.27.attn_v.weight": "bac1a99e38e25953f8315f7212eb9777dc216cadb09b959977885ae62724ceca", + "blk.27.ffn_gate_exps.weight": "6a15eca7f0f6ecfd93db2e55c63875348ec4a78c4ff643ec46df9e958c0101e4", + "blk.27.ffn_down_exps.weight": "2e1c91247c4359e2073a8e5f26fd7f6426da7be3ed5bc65dcfff701f0a5022b2", + "blk.27.ffn_up_exps.weight": "65d6f5c553c9332085eae4aeadf25090b5d7768212ea7b08ed698102c21b29a1", + "blk.27.attn_norm.weight": "7fab8ae63ec8e91ce625cd130ab96d8427dad3a7413bb21b25ec5f408c5b9f5a", + "blk.27.ffn_norm.weight": "532720546b0fdcd423a02ca6e3e9d8aacb84b1b3e8269968f88a47fe2a69bab4", + "blk.28.ffn_gate_inp.weight": "a305ea58d98962d9dcf0c53ad2389b7acc8936fb35a0e3fc9410e7767cd49dea", + "blk.28.attn_k.weight": "8315e8a2e4f78dfdf36d4fc18fffc74bc95fe42c3ae4f9af2b6c874612c0f71b", + "blk.28.attn_output.weight": "9b5fdedd32d39ef46a22cca7cd5355d7b93bd07ea305f466a8aad6ca5a4f3778", + "blk.28.attn_q.weight": "4e8fb96997c30e231c437130f410d7c91d541a816f6c568b5f3bfdb4b8dece74", + "blk.28.attn_v.weight": "1fec739cf3bd7b4913f72ca358d4cf31391c304de44ac0ae31ecb825beaa7cfd", + "blk.28.ffn_gate_exps.weight": "9f259789d535e09268266b9a8020f32d6a6779966c909d91d3a10574f06238a2", + "blk.28.ffn_down_exps.weight": "516d3f8abaedb01b9916a4b67d4672159769138ef2850158bc1b32c41e31f0e8", + "blk.28.ffn_up_exps.weight": "f2f1d88d2c31ed588806fb5ad981d68f5134d7284c4fc022fd018de2eef437fc", + "blk.28.attn_norm.weight": "960fd005598deadaebd969996f4367a9dbfad90539a863674fe95730935acc64", + "blk.28.ffn_norm.weight": "e1993b37ced93d4049e9af2c47b0d9207d8f7e6f2cc3a52f57bef30bc806d805", + "blk.29.ffn_gate_exps.weight": "58927146338f443513337476b3cd30e6341742f096c2beb5890d400f10121298", + "blk.29.ffn_down_exps.weight": "03a3386e4f0b75a28c5608e23b2de8f0de25f21954e4aa7fc343431bde9db07e", + "blk.29.ffn_up_exps.weight": "6916b7490a7ae7b04a5d81cc1e7ac9b20c483434f3b186b12d87fe176bf1567b", + "blk.29.ffn_gate_inp.weight": "98e710e467a3d567abe4ce29d78b8e8dc033148762290c0c5e1ae4d78efd8c78", + "blk.29.attn_norm.weight": "4e64cb307d37be20d55f38c94faf7e451d11df5e60df347906cbaf9c5441be71", + "blk.29.ffn_norm.weight": "696c23a52f742679bd44440d687a4c44b4302d57f1e9dc5610d23374336187e7", + "blk.29.attn_k.weight": "e85253652fd6120c623634ba66b725bf7cd491318b54ccdad2c7df8851d64c0a", + "blk.29.attn_output.weight": "4f650a71efb150d1f24cd4d114d4187bf570ac424da3b92ea6455abdf1aea705", + "blk.29.attn_q.weight": "69fa7da901026ebcbbbc848455b425458b7e3295007d7fc093acf4b38e2166ea", + "blk.29.attn_v.weight": "17e2e7590b317b21f106de546aafd955579703d1e95d6aea044ee72ec3a514c9", + "blk.30.ffn_gate_inp.weight": "3a03284b4aa60d59d4a2ec86253469b61fc656372afca427cb77a5332fbcc62c", + "blk.30.attn_k.weight": 
"d518cfd0db9708e769eb1399e87ee49357dc54d5afdbac3d4c0ca46c64e789eb", + "blk.30.attn_output.weight": "9b44378714d784c5ef9ab604359091baca4e0ec222afa139b7f840eaefb371fd", + "blk.30.attn_q.weight": "cbb95365bbfbcad0c9cd99b4eebb5a5d32de68ce08e4063b5ec3e792b7548044", + "blk.30.attn_v.weight": "e7985c04fe1740e35a9598f43b67b0922b4fc2d00b68a92a9f917b82c3248de1", + "blk.30.ffn_gate_exps.weight": "8ac4bbd07935d98f895ba94dc174e5ad5046c3c222b53729d60f987c05e7eb70", + "blk.30.ffn_down_exps.weight": "dd672cc71e82abf05064a18121b8e55fe1a4f19bc1d7cb9a142f4add54bc336e", + "blk.30.ffn_up_exps.weight": "12282f664a2a12aa25e2deac58946108715ebb978bafed5274cef24569107646", + "blk.30.attn_norm.weight": "1a33458fee054c6c9c896a4bb0a4e1fbfa0293b2408c7dd2b81d692e966e7273", + "blk.30.ffn_norm.weight": "311e33b68051f507f1478ed8f2693fddb846170ddb7285a91be43f795c2ce31e", + "blk.31.ffn_gate_exps.weight": "8af43d9867a51cd8392fb48b981b0ceee0ae979c491c07d711b3b56b5162c786", + "blk.31.ffn_down_exps.weight": "5579cb7758c1600b19d1f540deffe081b575962e37437b3b2efb2fb0a2924e40", + "blk.31.ffn_up_exps.weight": "f2e7c005276b3a001fb40753f027fa10b4d5a346f43cf4b4bbdeec6e74e1cf6a", + "blk.31.ffn_gate_inp.weight": "89885dc0e30b6b16a90c0331d7fa3174671e941364e8102d934f02132237e61b", + "blk.31.attn_norm.weight": "99e4e9bf86a9edf8c404153a7e8a82324ba79da462622196e2faba161bd95172", + "blk.31.ffn_norm.weight": "55335997cf6de781bf332b943de96ff4646966b05d9fee86b76ea897e27b6ca7", + "blk.31.attn_k.weight": "cee570762b78da6316b637892cc4b080e40f57af5551ffb1866b9a8e80e96628", + "blk.31.attn_output.weight": "fa321ff55ec7819ead7b819fd45215262f39744569765ba2113c989c03588802", + "blk.31.attn_q.weight": "9e2c409b878f8a2a1436874abf428fceb1c534b21f9ad4dd6f532b8a469007f0", + "blk.31.attn_v.weight": "a845d0be68ba537b4a775bfba4d897faf7c82a811a2612b0b7420cc4f3574cb8", + "output.weight": "16101cbb74b54cda9ebc07ca3c762e3263a56efb3cc011156184b95807d7cf13", + "output_norm.weight": "d7aa61585baedd60157aafe157930785742c55989c288573566a971b02423564" +} diff --git a/convert/tokenizer.go b/convert/tokenizer.go index fd6df5f5..baee04aa 100644 --- a/convert/tokenizer.go +++ b/convert/tokenizer.go @@ -3,19 +3,148 @@ package convert import ( "cmp" "crypto/sha256" + "encoding/hex" "encoding/json" + "errors" "fmt" "log/slog" "os" + "path/filepath" "slices" +) - "golang.org/x/exp/maps" +const ( + _ int32 = iota + tokenTypeNormal + tokenTypeUnknown + tokenTypeControl + tokenTypeUserDefined + tokenTypeUnused + tokenTypeByte ) type Tokenizer struct { - Version string `json:"version"` - AddedTokens []Token `json:"added_tokens"` - Model TokenizerModel `json:"model"` + *Vocabulary + SpecialVocabulary []*SpecialVocabulary + Merges []string + + Pre string + Template string +} + +func parseTokenizer(d string, specialTypes []string) (*Tokenizer, error) { + v, err := parseVocabulary(d) + if err != nil { + return nil, err + } + + t := &Tokenizer{ + Vocabulary: v, + Pre: "default", + } + + addedTokens := make(map[string]token) + if f, err := os.Open(filepath.Join(d, "tokenizer.json")); errors.Is(err, os.ErrNotExist) { + } else if err != nil { + return nil, err + } else { + defer f.Close() + + var tt tokenizer + if err := json.NewDecoder(f).Decode(&tt); err != nil { + return nil, err + } + + for _, t := range tt.AddedTokens { + addedTokens[t.Content] = t + } + + t.Merges = tt.Model.Merges + + sha256sum := sha256.New() + for _, pt := range tt.PreTokenizer.PreTokenizers { + switch pt.Type { + case "Split": + if pt.Pattern.Regex != "" { + sha256sum.Write([]byte(pt.Pattern.Regex)) + } + } + } + + 
switch digest := hex.EncodeToString(sha256sum.Sum(nil)); digest { + case "d98f9631be1e9607a9848c26c1f9eac1aa9fc21ac6ba82a2fc0741af9780a48f": + t.Pre = "llama-bpe" + case "03df5c5863ad70781dcfdef491ead25140f895fe8010964be0daefe27be32b02": + t.Pre = "deepseek-llm" + case "21cde974d587f0d54dc8d56b183cc1e6239600172035c68fbd6d4b9f8da0576e": + t.Pre = "deepseek-coder" + case "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855": + // noop, empty pretokenizer + default: + slog.Warn("unknown pretokenizer, using default", "digest", digest) + } + } + + if f, err := os.Open(filepath.Join(d, "tokenizer_config.json")); errors.Is(err, os.ErrNotExist) { + } else if err != nil { + return nil, err + } else { + defer f.Close() + + var p map[string]json.RawMessage + if err := json.NewDecoder(f).Decode(&p); err != nil { + return nil, err + } + + if template, ok := p["chat_template"]; ok { + if err := json.Unmarshal(template, &t.Template); err != nil { + return nil, err + } + } + + for _, st := range specialTypes { + sv := SpecialVocabulary{Type: st} + if bts, ok := p[fmt.Sprintf("add_%s_token", st)]; ok { + if err := json.Unmarshal(bts, &sv.AddToken); err != nil { + return nil, err + } + } + + if bts, ok := p[fmt.Sprintf("%s_token", st)]; ok { + var content string + if err := json.Unmarshal(bts, &content); err != nil { + var mm map[string]any + if err := json.Unmarshal(bts, &mm); err != nil { + continue + } + + content, ok = mm["content"].(string) + if !ok { + continue + } + } + + sv.Content = content + } + + if id, ok := addedTokens[sv.Content]; ok { + sv.ID = id.ID + t.SpecialVocabulary = append(t.SpecialVocabulary, &sv) + } + } + } + + return t, nil +} + +type tokenizer struct { + Version string `json:"version"` + AddedTokens []token `json:"added_tokens"` + Model struct { + Type string `json:"type"` + Vocab map[string]int `json:"vocab"` + Merges []string `json:"merges"` + } `json:"model"` PreTokenizer struct { PreTokenizers []struct { @@ -27,80 +156,106 @@ type Tokenizer struct { } `json:"pre_tokenizer"` } -type TokenizerModel struct { - Type string `json:"type"` - Vocab map[string]int `json:"vocab"` - Merges []string `json:"merges"` - Tokens []Token -} - -type Token struct { +type token struct { ID int `json:"id"` Content string `json:"content"` Special bool `json:"special"` UserDefined bool } -func (t *Token) Type() int32 { - switch { - case t.Special: - return tokenTypeControl - case t.UserDefined: - return tokenTypeUserDefined - default: - return tokenTypeNormal - } +type Vocabulary struct { + Model string + Tokens []string + Scores []float32 + Types []int32 } -func (t *Tokenizer) maxID() int { - return max( - slices.Max(maps.Values(t.Model.Vocab)), - slices.MaxFunc(t.AddedTokens, func(a, b Token) int { - return cmp.Compare(a.ID, b.ID) - }).ID, - ) -} - -func parseTokens(dirpath string) (pre string, tokens []Token, merges []string, err error) { - f, err := os.Open(dirpath) +func parseVocabularyFromTokenizer(p string) (*Vocabulary, error) { + f, err := os.Open(filepath.Join(p, "tokenizer.json")) if err != nil { - panic(err) + return nil, err } defer f.Close() - var t Tokenizer + var t tokenizer if err := json.NewDecoder(f).Decode(&t); err != nil { - return "", nil, nil, err + return nil, err } - tokens = make([]Token, t.maxID()+1) + var tokens []token for k, v := range t.Model.Vocab { - tokens[v] = Token{ID: v, Content: k, Special: false, UserDefined: false} + tokens = append(tokens, token{ + ID: v, + Content: k, + }) } - for _, v := range t.AddedTokens { - v.UserDefined = true - 
tokens[v.ID] = v + for _, t := range t.AddedTokens { + t.UserDefined = true + tokens = append(tokens, t) } - sha256sum := sha256.New() - for _, pt := range t.PreTokenizer.PreTokenizers { - if pt.Type == "Split" && pt.Pattern.Regex != "" { - sha256sum.Write([]byte(pt.Pattern.Regex)) + slices.SortFunc(tokens, func(i, j token) int { + return cmp.Compare(i.ID, j.ID) + }) + + v := Vocabulary{Model: "gpt2"} + for _, t := range tokens { + v.Tokens = append(v.Tokens, t.Content) + v.Scores = append(v.Scores, float32(t.ID)) + + switch { + case t.Special: + v.Types = append(v.Types, tokenTypeControl) + case t.UserDefined: + v.Types = append(v.Types, tokenTypeUserDefined) + default: + v.Types = append(v.Types, tokenTypeNormal) } } - switch digest := fmt.Sprintf("%x", sha256sum.Sum(nil)); digest { - case "d98f9631be1e9607a9848c26c1f9eac1aa9fc21ac6ba82a2fc0741af9780a48f": - pre = "llama-bpe" - case "03df5c5863ad70781dcfdef491ead25140f895fe8010964be0daefe27be32b02": - pre = "deepseek-llm" - case "21cde974d587f0d54dc8d56b183cc1e6239600172035c68fbd6d4b9f8da0576e": - pre = "deepseek-coder" - default: - slog.Warn("unknown pretokenizer, using default", "digest", digest) - pre = "default" + return &v, nil +} + +func parseVocabulary(d string) (*Vocabulary, error) { + patterns := map[string]func(string) (*Vocabulary, error){ + "tokenizer.model": parseSentencePiece, + "tokenizer.json": parseVocabularyFromTokenizer, } - return pre, tokens, t.Model.Merges, nil + for pattern, parseFn := range patterns { + matches, err := filepath.Glob(filepath.Join(d, pattern)) + if err != nil { + return nil, err + } + + if len(matches) > 0 { + return parseFn(d) + } + } + + return nil, errors.New("unknown tensor format") +} + +type SpecialVocabulary struct { + Type string + ID int + Content string + AddToken bool +} + +func (sv SpecialVocabulary) Key() string { + switch t := sv.Type; t { + case "bos", "eos", "cls", "mask": + return t + case "unk": + return "unknown" + case "sep": + //nolint:misspell // this is an upstream typo + return "seperator" + case "pad": + return "padding" + } + + panic("unknown special vocabulary type") } diff --git a/convert/tokenizer_spm.go b/convert/tokenizer_spm.go new file mode 100644 index 00000000..75d9fe26 --- /dev/null +++ b/convert/tokenizer_spm.go @@ -0,0 +1,83 @@ +package convert + +import ( + "cmp" + "encoding/json" + "errors" + "fmt" + "os" + "path/filepath" + "slices" + + "google.golang.org/protobuf/proto" + + "github.com/ollama/ollama/convert/sentencepiece" +) + +func parseSentencePiece(d string) (*Vocabulary, error) { + bts, err := os.ReadFile(filepath.Join(d, "tokenizer.model")) + if err != nil { + return nil, err + } + + var spm sentencepiece.ModelProto + if err := proto.Unmarshal(bts, &spm); err != nil { + return nil, err + } + + v := Vocabulary{Model: "llama"} + for _, piece := range spm.GetPieces() { + v.Tokens = append(v.Tokens, piece.GetPiece()) + v.Scores = append(v.Scores, piece.GetScore()) + + switch t := piece.GetType(); t { + case sentencepiece.ModelProto_SentencePiece_UNKNOWN, + sentencepiece.ModelProto_SentencePiece_CONTROL, + sentencepiece.ModelProto_SentencePiece_UNUSED, + sentencepiece.ModelProto_SentencePiece_BYTE: + v.Types = append(v.Types, int32(t)) + default: + v.Types = append(v.Types, int32(sentencepiece.ModelProto_SentencePiece_NORMAL)) + } + } + + f, err := os.Open(filepath.Join(d, "added_tokens.json")) + if errors.Is(err, os.ErrNotExist) { + return &v, nil + } else if err != nil { + return nil, err + } + defer f.Close() + + var atm map[string]int + if err := 
json.NewDecoder(f).Decode(&atm); err != nil { + return nil, err + } + + type t struct { + id int + content string + } + + var ts []t + for content, id := range atm { + ts = append(ts, t{id, content}) + } + + slices.SortFunc(ts, func(i, j t) int { + return cmp.Compare(i.id, j.id) + }) + + n := len(v.Tokens) + for i, t := range ts { + if t.id != i+n { + return nil, fmt.Errorf("invalid token id: %d", t.id) + } + + v.Tokens = append(v.Tokens, t.content) + v.Scores = append(v.Scores, -1000.0) + v.Types = append(v.Types, tokenTypeUserDefined) + } + + return &v, nil +} diff --git a/convert/torch.go b/convert/torch.go deleted file mode 100644 index 55414adc..00000000 --- a/convert/torch.go +++ /dev/null @@ -1,287 +0,0 @@ -package convert - -import ( - "encoding/binary" - "encoding/json" - "fmt" - "io" - "log/slog" - "os" - "path/filepath" - "regexp" - "strings" - - "github.com/nlpodyssey/gopickle/pytorch" - "github.com/nlpodyssey/gopickle/types" - "github.com/x448/float16" - - "github.com/ollama/ollama/llm" -) - -type torchWriterTo struct { - t *llm.Tensor - - params *Params - bo ByteOrder - - storage pytorch.StorageInterface - repacker func(string, []float32, []uint64) ([]float32, error) -} - -type TorchFormat struct{} - -func (tf *TorchFormat) GetTensors(dirpath string, params *Params) ([]llm.Tensor, error) { - slog.Debug("getting torch tensors") - - var files []string - if pt, _ := filepath.Glob(filepath.Join(dirpath, "consolidated*.pth")); len(pt) > 0 { - files = append(files, pt...) - } else if pt, _ := filepath.Glob(filepath.Join(dirpath, "pytorch_model*.pth")); len(pt) > 0 { - files = append(files, pt...) - } - - var offset uint64 - var tensors []llm.Tensor - for _, fn := range files { - m, err := pytorch.Load(fn) - if err != nil { - slog.Error(fmt.Sprintf("error unpickling: %q", err)) - return []llm.Tensor{}, err - } - - for _, k := range m.(*types.Dict).Keys() { - if strings.HasSuffix(k.(string), "self_attn.rotary_emb.inv_freq") { - continue - } - - t, _ := m.(*types.Dict).Get(k) - tshape := t.(*pytorch.Tensor).Size - - var size uint64 - var kind uint32 - switch len(tshape) { - case 0: - continue - case 1: - // convert to float32 - kind = 0 - size = uint64(tshape[0] * 4) - case 2: - // convert to float16 - kind = 1 - size = uint64(tshape[0] * tshape[1] * 2) - } - - ggufName, err := tf.GetLayerName(k.(string)) - if err != nil { - slog.Error(err.Error()) - return nil, err - } - slog.Debug(fmt.Sprintf("'%35s': '%30s' %10d [%#v]", k.(string), ggufName, size, tshape)) - - shape := []uint64{0, 0, 0, 0} - for i := range tshape { - shape[i] = uint64(tshape[i]) - } - - tensor := llm.Tensor{ - Name: ggufName, - Kind: kind, - Offset: offset, // calculate the offset - Shape: shape, - } - - tensor.WriterTo = torchWriterTo{ - t: &tensor, - params: params, - bo: params.ByteOrder, - storage: t.(*pytorch.Tensor).Source, - } - - tensors = append(tensors, tensor) - offset += size - } - } - - return tensors, nil -} - -func getAltParams(dirpath string) (*Params, error) { - f, err := os.Open(filepath.Join(dirpath, "params.json")) - if err != nil { - slog.Error("no params.json") - return nil, err - } - defer f.Close() - - type TorchParams struct { - HiddenSize int `json:"dim"` - AttentionHeads int `json:"n_heads"` - KeyValHeads int `json:"n_kv_heads"` - HiddenLayers int `json:"n_layers"` - RopeTheta float64 `json:"rope_theta"` - NormEPS float64 `json:"norm_eps"` - } - - var tparams TorchParams - - d := json.NewDecoder(f) - err = d.Decode(&tparams) - if err != nil { - return nil, err - } - - params := &Params{ - 
Architectures: []string{"LlamaForCausalLM"}, - HiddenSize: tparams.HiddenSize, - AttentionHeads: tparams.AttentionHeads, - KeyValHeads: tparams.KeyValHeads, - HiddenLayers: tparams.HiddenLayers, - NormEPS: tparams.NormEPS, - } - - switch { - case tparams.RopeTheta == 1000000: - // Codellama - params.ContextSize = 16384 - case tparams.NormEPS == 1e-06: - // llama2 - slog.Debug("Found llama2 - setting context size to 4096") - params.ContextSize = 4096 - default: - params.ContextSize = 2048 - } - - params.ByteOrder = binary.LittleEndian - return params, nil -} - -func (m *TorchFormat) GetParams(dirpath string) (*Params, error) { - f, err := os.Open(filepath.Join(dirpath, "config.json")) - if err != nil { - if os.IsNotExist(err) { - // try params.json instead - return getAltParams(dirpath) - } else { - return nil, err - } - } - - var params Params - d := json.NewDecoder(f) - err = d.Decode(¶ms) - if err != nil { - return nil, err - } - - params.ByteOrder = binary.LittleEndian - return ¶ms, nil -} - -func (m *TorchFormat) GetLayerName(n string) (string, error) { - directMap := map[string]string{ - "tok_embeddings.weight": "token_embd.weight", - "output.weight": "output.weight", - "norm.weight": "output_norm.weight", - "rope.freqs": "rope_freqs.weight", - "model.embed_tokens.weight": "token_embd.weight", - "lm_head.weight": "output.weight", - "model.norm.weight": "output_norm.weight", - } - - lMap := map[string]string{ - "layers.(\\d+).attention_norm.weight": "blk.$1.attn_norm.weight", - "layers.(\\d+).attention_output_norm.weight": "blk.$1.attn_norm.weight", - "layers.(\\d+).feed_forward.w2.weight": "blk.$1.ffn_down.weight", - "layers.(\\d+).feed_forward.w1.weight": "blk.$1.ffn_gate.weight", - "layers.(\\d+).feed_forward.w3.weight": "blk.$1.ffn_up.weight", - "layers.(\\d+).ffn_norm.weight": "blk.$1.ffn_norm.weight", - "layers.(\\d+).attention.wk.weight": "blk.$1.attn_k.weight", - "layers.(\\d+).attention.wo.weight": "blk.$1.attn_output.weight", - "layers.(\\d+).attention.wq.weight": "blk.$1.attn_q.weight", - "layers.(\\d+).attention.wv.weight": "blk.$1.attn_v.weight", - "model.layers.(\\d+).input_layernorm.weight": "blk.$1.attn_norm.weight", - "model.layers.(\\d+).mlp.down_proj.weight": "blk.$1.ffn_down.weight", - "model.layers.(\\d+).mlp.gate_proj.weight": "blk.$1.ffn_gate.weight", - "model.layers.(\\d+).mlp.up_proj.weight": "blk.$1.ffn_up.weight", - "model.layers.(\\d+).post_attention_layernorm.weight": "blk.$1.ffn_norm.weight", - "model.layers.(\\d+).self_attn.k_proj.weight": "blk.$1.attn_k.weight", - "model.layers.(\\d+).self_attn.o_proj.weight": "blk.$1.attn_output.weight", - "model.layers.(\\d+).self_attn.q_proj.weight": "blk.$1.attn_q.weight", - "model.layers.(\\d+).self_attn.v_proj.weight": "blk.$1.attn_v.weight", - } - - v, ok := directMap[n] - if ok { - return v, nil - } - - // quick hack to rename the layers to gguf format - for k, v := range lMap { - re := regexp.MustCompile(k) - newName := re.ReplaceAllString(n, v) - if newName != n { - return newName, nil - } - } - - return "", fmt.Errorf("couldn't find a layer name for '%s'", n) -} - -func (r torchWriterTo) WriteTo(w io.Writer) (n int64, err error) { - var f32s []float32 - switch s := r.storage.(type) { - case *pytorch.FloatStorage: - f32s = s.Data - case *pytorch.HalfStorage: - f32s = s.Data - case *pytorch.BFloat16Storage: - f32s = s.Data - default: - return 0, fmt.Errorf("unknown data type: %T", s) - } - - if r.repacker != nil { - f32s, err = r.repacker(r.t.Name, f32s, r.t.Shape) - if err != nil { - return 0, err - } - } - - 
switch r.t.Kind { - case 0: - return 0, binary.Write(w, r.bo, f32s) - case 1: - f16s := make([]uint16, len(f32s)) - for i := range f32s { - f16s[i] = float16.Fromfloat32(f32s[i]).Bits() - } - - return 0, binary.Write(w, r.bo, f16s) - default: - return 0, fmt.Errorf("unknown storage type: %d", r.t.Kind) - } -} - -func (m *TorchFormat) GetModelArch(name, dirPath string, params *Params) (ModelArch, error) { - switch len(params.Architectures) { - case 0: - return nil, fmt.Errorf("No architecture specified to convert") - case 1: - switch params.Architectures[0] { - case "LlamaForCausalLM": - return &LlamaModel{ - ModelData{ - Name: name, - Path: dirPath, - Params: params, - Format: m, - }, - }, nil - default: - return nil, fmt.Errorf("Models based on '%s' are not yet supported", params.Architectures[0]) - } - } - - return nil, fmt.Errorf("Unknown error") -} diff --git a/llm/gguf.go b/llm/gguf.go index aadfc4ba..e61babf2 100644 --- a/llm/gguf.go +++ b/llm/gguf.go @@ -2,11 +2,16 @@ package llm import ( "bytes" + "cmp" "encoding/binary" "encoding/json" "fmt" "io" + "log/slog" + "slices" "strings" + + "golang.org/x/exp/maps" ) type containerGGUF struct { @@ -88,7 +93,7 @@ type gguf struct { kv KV tensors []*Tensor - parameters uint64 + parameters uint64 tensorOffset uint64 scratch [16 << 10]byte @@ -101,10 +106,6 @@ func newGGUF(container *containerGGUF) *gguf { } } -func NewGGUFV3(bo binary.ByteOrder) *gguf { - return newGGUF(&containerGGUF{ByteOrder: bo, Version: 3}) -} - func (llm *gguf) KV() KV { return llm.kv } @@ -203,7 +204,7 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error { return fmt.Errorf("failed to read tensor dimensions: %w", err) } - shape := [4]uint64{1, 1, 1, 1} + shape := make([]uint64, dims) for i := 0; uint32(i) < dims; i++ { shape[i], err = readGGUF[uint64](llm, rs) if err != nil { @@ -245,7 +246,7 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error { return err } - padding := llm.padding(offset, int64(alignment)) + padding := ggufPadding(offset, int64(alignment)) llm.tensorOffset = uint64(offset + padding) for _, tensor := range llm.tensors { @@ -254,7 +255,7 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error { return fmt.Errorf("failed to get current offset: %w", err) } - padding := llm.padding(offset, int64(alignment)) + padding := ggufPadding(offset, int64(alignment)) if _, err := rs.Seek(padding, io.SeekCurrent); err != nil { return fmt.Errorf("failed to seek to init padding: %w", err) } @@ -273,12 +274,12 @@ func readGGUF[T any](llm *gguf, r io.Reader) (T, error) { return t, err } -func writeGGUF[V any](llm *gguf, w io.Writer, t uint32, v V) error { - if err := binary.Write(w, llm.ByteOrder, t); err != nil { +func writeGGUF[V any](w io.Writer, t uint32, v V) error { + if err := binary.Write(w, binary.LittleEndian, t); err != nil { return err } - return binary.Write(w, llm.ByteOrder, v) + return binary.Write(w, binary.LittleEndian, v) } func readGGUFV1String(llm *gguf, r io.Reader) (string, error) { @@ -342,12 +343,12 @@ func readGGUFString(llm *gguf, r io.Reader) (string, error) { return string(buf), nil } -func writeGGUFString(llm *gguf, w io.Writer, s string) error { - if err := binary.Write(w, llm.ByteOrder, ggufTypeString); err != nil { +func writeGGUFString(w io.Writer, s string) error { + if err := binary.Write(w, binary.LittleEndian, ggufTypeString); err != nil { return err } - if err := binary.Write(w, llm.ByteOrder, uint64(len(s))); err != nil { + if err := binary.Write(w, binary.LittleEndian, uint64(len(s))); err != nil { return err } @@ -488,21 +489,21 @@ func 
readGGUFArray(llm *gguf, r io.Reader) (*array, error) { return a, nil } -func writeGGUFArray[S ~[]E, E any](llm *gguf, w io.Writer, t uint32, s S) error { - if err := binary.Write(w, llm.ByteOrder, ggufTypeArray); err != nil { +func writeGGUFArray[S ~[]E, E any](w io.Writer, t uint32, s S) error { + if err := binary.Write(w, binary.LittleEndian, ggufTypeArray); err != nil { return err } - if err := binary.Write(w, llm.ByteOrder, t); err != nil { + if err := binary.Write(w, binary.LittleEndian, t); err != nil { return err } - if err := binary.Write(w, llm.ByteOrder, uint64(len(s))); err != nil { + if err := binary.Write(w, binary.LittleEndian, uint64(len(s))); err != nil { return err } for _, e := range s { - if err := binary.Write(w, llm.ByteOrder, e); err != nil { + if err := binary.Write(w, binary.LittleEndian, e); err != nil { return err } } @@ -510,194 +511,55 @@ func writeGGUFArray[S ~[]E, E any](llm *gguf, w io.Writer, t uint32, s S) error return nil } -var ggufKVOrder = map[string][]string{ - "llama": { - "general.architecture", - "general.name", - "llama.vocab_size", - "llama.context_length", - "llama.embedding_length", - "llama.block_count", - "llama.feed_forward_length", - "llama.attention.head_count", - "llama.attention.head_count_kv", - "llama.attention.layer_norm_rms_epsilon", - "llama.rope.freq_base", - "llama.rope.dimension_count", - "llama.expert_count", - "llama.expert_used_count", - "gemma.context_length", - "gemma.embedding_length", - "gemma.block_count", - "gemma.feed_forward_length", - "gemma.attention.head_count", - "gemma.attention.head_count_kv", - "gemma.attention.layer_norm_rms_epsilon", - "gemma.attention.key_length", - "gemma.attention.value_length", - "general.file_type", - "tokenizer.ggml.pre", - "tokenizer.ggml.model", - "tokenizer.ggml.tokens", - "tokenizer.ggml.scores", - "tokenizer.ggml.merges", - "tokenizer.ggml.token_type", - "tokenizer.ggml.bos_token_id", - "tokenizer.ggml.eos_token_id", - "tokenizer.ggml.unknown_token_id", - "tokenizer.ggml.padding_token_id", - "tokenizer.ggml.add_bos_token", - "tokenizer.ggml.add_eos_token", - "tokenizer.chat_template", - "bert.pooling_type", - }, -} - -func (llm *gguf) Encode(ws io.WriteSeeker, kv KV, tensors []Tensor) error { - switch llm.Version { - case 3: - llm.V3.NumTensor = uint64(len(tensors)) - llm.V3.NumKV = uint64(len(kv)) - default: - return fmt.Errorf("not implemented: ggufv%d", llm.Version) - } - - if err := binary.Write(ws, llm.ByteOrder, []byte("GGUF")); err != nil { +func WriteGGUF(ws io.WriteSeeker, kv KV, ts []*Tensor) error { + if err := binary.Write(ws, binary.LittleEndian, []byte("GGUF")); err != nil { return err } - if err := binary.Write(ws, llm.ByteOrder, llm.Version); err != nil { + if err := binary.Write(ws, binary.LittleEndian, uint32(3)); err != nil { return err } - if err := binary.Write(ws, llm.ByteOrder, llm.numTensor()); err != nil { + if err := binary.Write(ws, binary.LittleEndian, uint64(len(ts))); err != nil { return err } - if err := binary.Write(ws, llm.ByteOrder, llm.numKV()); err != nil { + if err := binary.Write(ws, binary.LittleEndian, uint64(len(kv))); err != nil { return err } - kvCheck := make(map[string]bool) - for k := range kv { - kvCheck[k] = false - } + keys := maps.Keys(kv) + slices.Sort(keys) - for _, k := range ggufKVOrder["llama"] { - v, ok := kv[k] - if !ok { - continue - } - kvCheck[k] = true - - if err := binary.Write(ws, llm.ByteOrder, uint64(len(k))); err != nil { - return err - } - - if err := binary.Write(ws, llm.ByteOrder, []byte(k)); err != nil { - return 
err - } - - var err error - switch v := v.(type) { - case uint32: - err = writeGGUF(llm, ws, ggufTypeUint32, v) - case float32: - err = writeGGUF(llm, ws, ggufTypeFloat32, v) - case bool: - err = writeGGUF(llm, ws, ggufTypeBool, v) - case string: - err = writeGGUFString(llm, ws, v) - case []int32: - err = writeGGUFArray(llm, ws, ggufTypeInt32, v) - case []uint32: - err = writeGGUFArray(llm, ws, ggufTypeUint32, v) - case []float32: - err = writeGGUFArray(llm, ws, ggufTypeFloat32, v) - case []string: - if err := binary.Write(ws, llm.ByteOrder, ggufTypeArray); err != nil { - return err - } - - if err := binary.Write(ws, llm.ByteOrder, ggufTypeString); err != nil { - return err - } - - if err := binary.Write(ws, llm.ByteOrder, uint64(len(v))); err != nil { - return err - } - - for _, e := range v { - if err := binary.Write(ws, llm.ByteOrder, uint64(len(e))); err != nil { - return err - } - - if err := binary.Write(ws, llm.ByteOrder, []byte(e)); err != nil { - return err - } - } - default: - return fmt.Errorf("improper type for '%s'", k) - } - if err != nil { + for _, key := range keys { + if err := ggufWriteKV(ws, key, kv[key]); err != nil { return err } } - for k, v := range kvCheck { - if !v { - return fmt.Errorf("Didn't know how to write kv %s", k) + slices.SortFunc(ts, func(a, b *Tensor) int { + var i, j int + if n, err := fmt.Sscanf(a.Name, "blk.%d", &i); err != nil || n != 1 { + return cmp.Compare(a.Name, b.Name) + } else if n, err := fmt.Sscanf(b.Name, "blk.%d", &j); err != nil || n != 1 { + return cmp.Compare(a.Name, b.Name) } - } - for _, tensor := range tensors { - if err := binary.Write(ws, llm.ByteOrder, uint64(len(tensor.Name))); err != nil { - return err - } - - if err := binary.Write(ws, llm.ByteOrder, []byte(tensor.Name)); err != nil { - return err - } - - var dims int - for cnt := range len(tensor.Shape) { - if tensor.Shape[cnt] > 0 { - dims++ - } - } - - if err := binary.Write(ws, llm.ByteOrder, uint32(dims)); err != nil { - return err - } - - for i := range dims { - if err := binary.Write(ws, llm.ByteOrder, tensor.Shape[dims-1-i]); err != nil { - return err - } - } - - if err := binary.Write(ws, llm.ByteOrder, tensor.Kind); err != nil { - return err - } - - if err := binary.Write(ws, llm.ByteOrder, tensor.Offset); err != nil { + return cmp.Compare(i, j) + }) + + var s uint64 + for _, t := range ts { + t.Offset = s + if err := ggufWriteTensorInfo(ws, t); err != nil { return err } + s += t.Size() } var alignment int64 = 32 - for _, tensor := range tensors { - offset, err := ws.Seek(0, io.SeekCurrent) - if err != nil { - return err - } - - padding := llm.padding(offset, alignment) - if err := binary.Write(ws, llm.ByteOrder, bytes.Repeat([]byte{0}, int(padding))); err != nil { - return err - } - - if _, err := tensor.WriteTo(ws); err != nil { + for _, t := range ts { + if err := ggufWriteTensor(ws, t, alignment); err != nil { return err } } @@ -705,6 +567,102 @@ func (llm *gguf) Encode(ws io.WriteSeeker, kv KV, tensors []Tensor) error { return nil } -func (gguf) padding(offset, align int64) int64 { +func ggufWriteKV(ws io.WriteSeeker, k string, v any) error { + slog.Debug(k, "type", fmt.Sprintf("%T", v)) + if err := binary.Write(ws, binary.LittleEndian, uint64(len(k))); err != nil { + return err + } + + if err := binary.Write(ws, binary.LittleEndian, []byte(k)); err != nil { + return err + } + + var err error + switch v := v.(type) { + case uint32: + err = writeGGUF(ws, ggufTypeUint32, v) + case float32: + err = writeGGUF(ws, ggufTypeFloat32, v) + case bool: + err = 
writeGGUF(ws, ggufTypeBool, v) + case string: + err = writeGGUFString(ws, v) + case []int32: + err = writeGGUFArray(ws, ggufTypeInt32, v) + case []uint32: + err = writeGGUFArray(ws, ggufTypeUint32, v) + case []float32: + err = writeGGUFArray(ws, ggufTypeFloat32, v) + case []string: + if err := binary.Write(ws, binary.LittleEndian, ggufTypeArray); err != nil { + return err + } + + if err := binary.Write(ws, binary.LittleEndian, ggufTypeString); err != nil { + return err + } + + if err := binary.Write(ws, binary.LittleEndian, uint64(len(v))); err != nil { + return err + } + + for _, e := range v { + if err := binary.Write(ws, binary.LittleEndian, uint64(len(e))); err != nil { + return err + } + + if err := binary.Write(ws, binary.LittleEndian, []byte(e)); err != nil { + return err + } + } + default: + return fmt.Errorf("improper type for '%s'", k) + } + + return err +} + +func ggufWriteTensorInfo(ws io.WriteSeeker, t *Tensor) error { + slog.Debug(t.Name, "kind", t.Kind, "shape", t.Shape, "offset", t.Offset) + if err := binary.Write(ws, binary.LittleEndian, uint64(len(t.Name))); err != nil { + return err + } + + if err := binary.Write(ws, binary.LittleEndian, []byte(t.Name)); err != nil { + return err + } + + if err := binary.Write(ws, binary.LittleEndian, uint32(len(t.Shape))); err != nil { + return err + } + + for i := range len(t.Shape) { + if err := binary.Write(ws, binary.LittleEndian, t.Shape[len(t.Shape)-i-1]); err != nil { + return err + } + } + + if err := binary.Write(ws, binary.LittleEndian, t.Kind); err != nil { + return err + } + + return binary.Write(ws, binary.LittleEndian, t.Offset) +} + +func ggufWriteTensor(ws io.WriteSeeker, t *Tensor, alignment int64) error { + offset, err := ws.Seek(0, io.SeekCurrent) + if err != nil { + return err + } + + if err := binary.Write(ws, binary.LittleEndian, bytes.Repeat([]byte{0}, int(ggufPadding(offset, alignment)))); err != nil { + return err + } + + _, err = t.WriteTo(ws) + return err +} + +func ggufPadding(offset, align int64) int64 { return (align - offset%align) % align } diff --git a/llm/memory_test.go b/llm/memory_test.go index 06ae7438..18c797ee 100644 --- a/llm/memory_test.go +++ b/llm/memory_test.go @@ -2,7 +2,6 @@ package llm import ( "bytes" - "encoding/binary" "fmt" "os" "testing" @@ -20,10 +19,9 @@ func TestEstimateGPULayers(t *testing.T) { f, err := os.CreateTemp(t.TempDir(), modelName) require.NoError(t, err) defer f.Close() - gguf := NewGGUFV3(binary.LittleEndian) inputLayerCount := 5 - tensors := []Tensor{ + tensors := []*Tensor{ {Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))}, {Name: "blk.1.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))}, {Name: "blk.2.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))}, @@ -32,7 +30,7 @@ func TestEstimateGPULayers(t *testing.T) { {Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))}, } assert.Len(t, tensors, inputLayerCount+1) - err = gguf.Encode(f, KV{ + err = WriteGGUF(f, KV{ "general.architecture": "llama", "general.name": "name", "llama.context_length": uint32(32), diff --git a/server/model.go b/server/model.go index c6d3078f..81272a34 100644 --- a/server/model.go +++ b/server/model.go @@ -143,30 +143,6 @@ func parseFromZipFile(_ context.Context, file *os.File, digest 
string, fn func(a return nil, err } - mf, err := convert.GetModelFormat(tempDir) - if err != nil { - return nil, err - } - - params, err := mf.GetParams(tempDir) - if err != nil { - return nil, err - } - - mArch, err := mf.GetModelArch("", tempDir, params) - if err != nil { - return nil, err - } - - fn(api.ProgressResponse{Status: "processing tensors"}) - if err := mArch.GetTensors(); err != nil { - return nil, err - } - - if err := mArch.LoadVocab(); err != nil { - return nil, err - } - fn(api.ProgressResponse{Status: "converting model"}) // TODO(mxyng): this should write directly into a layer @@ -178,7 +154,7 @@ func parseFromZipFile(_ context.Context, file *os.File, digest string, fn func(a defer temp.Close() defer os.Remove(temp.Name()) - if err = mArch.WriteGGUF(temp); err != nil { + if err := convert.Convert(tempDir, temp); err != nil { return nil, err } diff --git a/server/routes_create_test.go b/server/routes_create_test.go index 8c714209..4d616d8d 100644 --- a/server/routes_create_test.go +++ b/server/routes_create_test.go @@ -2,7 +2,6 @@ package server import ( "bytes" - "encoding/binary" "encoding/json" "fmt" "io" @@ -20,7 +19,7 @@ import ( var stream bool = false -func createBinFile(t *testing.T, kv map[string]any, ti []llm.Tensor) string { +func createBinFile(t *testing.T, kv map[string]any, ti []*llm.Tensor) string { t.Helper() f, err := os.CreateTemp(t.TempDir(), "") @@ -29,7 +28,7 @@ func createBinFile(t *testing.T, kv map[string]any, ti []llm.Tensor) string { } defer f.Close() - if err := llm.NewGGUFV3(binary.LittleEndian).Encode(f, kv, ti); err != nil { + if err := llm.WriteGGUF(f, kv, ti); err != nil { t.Fatal(err) } diff --git a/server/routes_generate_test.go b/server/routes_generate_test.go index 5c0caff1..02f95dd2 100644 --- a/server/routes_generate_test.go +++ b/server/routes_generate_test.go @@ -101,7 +101,7 @@ func TestGenerateChat(t *testing.T) { "tokenizer.ggml.tokens": []string{""}, "tokenizer.ggml.scores": []float32{0}, "tokenizer.ggml.token_type": []int32{0}, - }, []llm.Tensor{ + }, []*llm.Tensor{ {Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))}, {Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))}, {Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))}, @@ -149,7 +149,7 @@ func TestGenerateChat(t *testing.T) { Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, llm.KV{ "general.architecture": "bert", "bert.pooling_type": uint32(0), - }, []llm.Tensor{})), + }, []*llm.Tensor{})), Stream: &stream, }) @@ -399,7 +399,7 @@ func TestGenerate(t *testing.T) { "tokenizer.ggml.tokens": []string{""}, "tokenizer.ggml.scores": []float32{0}, "tokenizer.ggml.token_type": []int32{0}, - }, []llm.Tensor{ + }, []*llm.Tensor{ {Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))}, {Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))}, {Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))}, @@ -447,7 +447,7 @@ func TestGenerate(t *testing.T) { Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, llm.KV{ "general.architecture": "bert", "bert.pooling_type": uint32(0), - }, []llm.Tensor{})), + }, []*llm.Tensor{})), Stream: &stream, }) diff --git a/server/sched_test.go b/server/sched_test.go index 6959dace..f3c55514 100644 --- a/server/sched_test.go +++ b/server/sched_test.go @@ -3,7 +3,6 @@ package server import ( "bytes" "context" - 
"encoding/binary" "fmt" "log/slog" "os" @@ -114,8 +113,7 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est require.NoError(t, err) defer f.Close() - gguf := llm.NewGGUFV3(binary.LittleEndian) - err = gguf.Encode(f, llm.KV{ + require.NoError(t, llm.WriteGGUF(f, llm.KV{ "general.architecture": "llama", "general.name": "name", "llama.context_length": uint32(32), @@ -126,10 +124,10 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est "tokenizer.ggml.tokens": []string{" "}, "tokenizer.ggml.scores": []float32{0}, "tokenizer.ggml.token_type": []int32{0}, - }, []llm.Tensor{ + }, []*llm.Tensor{ {Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))}, {Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))}, - }) + })) require.NoError(t, err) fname := f.Name() From df993fa37bde19039231001be9f852386a12a860 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Mon, 8 Jul 2024 16:59:48 -0700 Subject: [PATCH 64/79] comments --- convert/convert.go | 46 +++++++++++++++++----------------- convert/convert_gemma.go | 6 ++--- convert/convert_llama.go | 6 ++--- convert/convert_mixtral.go | 6 ++--- convert/reader.go | 9 +++++-- convert/reader_safetensors.go | 5 ++-- convert/tokenizer.go | 15 +++++------ llm/gguf.go | 17 +++++-------- llm/memory_test.go | 2 +- server/routes_create_test.go | 2 +- server/routes_generate_test.go | 8 +++--- server/sched_test.go | 2 +- 12 files changed, 63 insertions(+), 61 deletions(-) diff --git a/convert/convert.go b/convert/convert.go index 4ad64d72..30c5a53f 100644 --- a/convert/convert.go +++ b/convert/convert.go @@ -40,13 +40,13 @@ func (Parameters) KV(t *Tokenizer) llm.KV { return kv } -func (Parameters) specialTypes() []string { +func (Parameters) specialTokenTypes() []string { return []string{ "bos", "eos", "unk", "sep", "pad", "cls", "mask", } } -func (Parameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []*llm.Tensor) error { +func (Parameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error { return llm.WriteGGUF(ws, kv, ts) } @@ -54,24 +54,27 @@ type Converter interface { // KV maps parameters to LLM key-values KV(*Tokenizer) llm.KV // Tensors maps input tensors to LLM tensors. Model specific modifications can be done here. - Tensors([]Tensor) []*llm.Tensor + Tensors([]Tensor) []llm.Tensor // tensorName returns the LLM tensor name for a specific input name tensorName(string) string - // specialTypes returns any special token types the model uses - specialTypes() []string - writeFile(io.WriteSeeker, llm.KV, []*llm.Tensor) error + // specialTokenTypes returns any special token types the model uses + specialTokenTypes() []string + writeFile(io.WriteSeeker, llm.KV, []llm.Tensor) error } -func Convert(d string, ws io.WriteSeeker) error { - f, err := os.Open(filepath.Join(d, "config.json")) +// Convert writes an Ollama compatible model to the provided io.WriteSeeker based on configurations +// and files it finds in the input path. +// Supported input model formats include safetensors. +// Supported input tokenizers files include tokenizer.json (preferred) and tokenizer.model. 
+func Convert(path string, ws io.WriteSeeker) error { + bts, err := os.ReadFile(filepath.Join(path, "config.json")) if err != nil { return err } - defer f.Close() var p Parameters - if err := json.NewDecoder(f).Decode(&p); err != nil { + if err := json.Unmarshal(bts, &p); err != nil { return err } @@ -79,28 +82,23 @@ func Convert(d string, ws io.WriteSeeker) error { return errors.New("unknown architecture") } - var c Converter + var conv Converter switch p.Architectures[0] { case "LlamaForCausalLM", "MistralForCausalLM": - c = &llama{} + conv = &llama{} case "MixtralForCausalLM": - c = &mixtral{} + conv = &mixtral{} case "GemmaForCausalLM": - c = &gemma{} + conv = &gemma{} default: return errors.New("unsupported architecture") } - bts, err := os.ReadFile(filepath.Join(d, "config.json")) - if err != nil { + if err := json.Unmarshal(bts, conv); err != nil { return err } - if err := json.Unmarshal(bts, c); err != nil { - return err - } - - t, err := parseTokenizer(d, c.specialTypes()) + t, err := parseTokenizer(path, conv.specialTokenTypes()) if err != nil { return err } @@ -112,12 +110,14 @@ func Convert(d string, ws io.WriteSeeker) error { t.Vocabulary.Scores = append(t.Vocabulary.Scores, -1) t.Vocabulary.Types = append(t.Vocabulary.Types, tokenTypeUserDefined) } + } else { + slog.Debug("vocabulary", "size", len(t.Vocabulary.Tokens)) } - ts, err := parseTensors(d) + ts, err := parseTensors(path) if err != nil { return err } - return c.writeFile(ws, c.KV(t), c.Tensors(ts)) + return conv.writeFile(ws, conv.KV(t), conv.Tensors(ts)) } diff --git a/convert/convert_gemma.go b/convert/convert_gemma.go index 332fee7f..9213e157 100644 --- a/convert/convert_gemma.go +++ b/convert/convert_gemma.go @@ -43,15 +43,15 @@ func (p *gemma) KV(t *Tokenizer) llm.KV { return kv } -func (p *gemma) Tensors(ts []Tensor) []*llm.Tensor { - var out []*llm.Tensor +func (p *gemma) Tensors(ts []Tensor) []llm.Tensor { + var out []llm.Tensor for _, t := range ts { name := p.tensorName(t.Name()) if strings.HasSuffix(name, "_norm.weight") { t.SetRepacker(p.addOne) } - out = append(out, &llm.Tensor{ + out = append(out, llm.Tensor{ Name: name, Kind: t.Kind(), Shape: t.Shape(), diff --git a/convert/convert_llama.go b/convert/convert_llama.go index 700049d3..ed6469c5 100644 --- a/convert/convert_llama.go +++ b/convert/convert_llama.go @@ -96,8 +96,8 @@ func (p *llama) KV(t *Tokenizer) llm.KV { return kv } -func (p *llama) Tensors(ts []Tensor) []*llm.Tensor { - var out []*llm.Tensor +func (p *llama) Tensors(ts []Tensor) []llm.Tensor { + var out []llm.Tensor for _, t := range ts { name := p.tensorName(t.Name()) if strings.HasSuffix(name, "attn_q.weight") || @@ -105,7 +105,7 @@ func (p *llama) Tensors(ts []Tensor) []*llm.Tensor { t.SetRepacker(p.repack) } - out = append(out, &llm.Tensor{ + out = append(out, llm.Tensor{ Name: name, Kind: t.Kind(), Shape: t.Shape(), diff --git a/convert/convert_mixtral.go b/convert/convert_mixtral.go index c55a27f8..3263a27b 100644 --- a/convert/convert_mixtral.go +++ b/convert/convert_mixtral.go @@ -31,7 +31,7 @@ func (p *mixtral) KV(t *Tokenizer) llm.KV { return kv } -func (p *mixtral) Tensors(ts []Tensor) []*llm.Tensor { +func (p *mixtral) Tensors(ts []Tensor) []llm.Tensor { oldnew := []string{ "model.layers", "blk", "w1", "ffn_gate_exps", @@ -58,10 +58,10 @@ func (p *mixtral) Tensors(ts []Tensor) []*llm.Tensor { return true }) - var out []*llm.Tensor + var out []llm.Tensor for n, e := range experts { // TODO(mxyng): sanity check experts - out = append(out, &llm.Tensor{ + out = append(out, 
llm.Tensor{ Name: n, Kind: e[0].Kind(), Shape: append([]uint64{uint64(len(e))}, e[0].Shape()...), diff --git a/convert/reader.go b/convert/reader.go index 9be8ac2e..11ccaa81 100644 --- a/convert/reader.go +++ b/convert/reader.go @@ -29,6 +29,11 @@ func (t tensorBase) Shape() []uint64 { return t.shape } +const ( + tensorKindF32 uint32 = iota + tensorKindF16 +) + func (t tensorBase) Kind() uint32 { if strings.HasSuffix(t.name, ".block_sparse_moe.gate.weight") { return 0 @@ -38,9 +43,9 @@ func (t tensorBase) Kind() uint32 { case 0: panic("invalid tensor shape") case 1: - return 0 + return tensorKindF32 default: - return 1 + return tensorKindF16 } } diff --git a/convert/reader_safetensors.go b/convert/reader_safetensors.go index 440581af..d43c59a5 100644 --- a/convert/reader_safetensors.go +++ b/convert/reader_safetensors.go @@ -66,6 +66,7 @@ func parseSafetensors(ps ...string) ([]Tensor, error) { return ts, nil } +// safetensorsPad returns the padded size of the safetensors file given a length n and offset s func safetensorsPad(n, s int64) int64 { return 8 + n + s } @@ -125,9 +126,9 @@ func (st safetensor) WriteTo(w io.Writer) (int64, error) { } switch st.Kind() { - case 0: + case tensorKindF32: return 0, binary.Write(w, binary.LittleEndian, f32s) - case 1: + case tensorKindF16: f16s := make([]uint16, len(f32s)) for i := range f32s { f16s[i] = float16.Fromfloat32(f32s[i]).Bits() diff --git a/convert/tokenizer.go b/convert/tokenizer.go index baee04aa..43d8c14e 100644 --- a/convert/tokenizer.go +++ b/convert/tokenizer.go @@ -32,7 +32,7 @@ type Tokenizer struct { Template string } -func parseTokenizer(d string, specialTypes []string) (*Tokenizer, error) { +func parseTokenizer(d string, specialTokenTypes []string) (*Tokenizer, error) { v, err := parseVocabulary(d) if err != nil { return nil, err @@ -66,6 +66,8 @@ func parseTokenizer(d string, specialTypes []string) (*Tokenizer, error) { switch pt.Type { case "Split": if pt.Pattern.Regex != "" { + // create a checksum of all Split pretokenizers which should be sufficient + // to identify the pretokenizer sha256sum.Write([]byte(pt.Pattern.Regex)) } } @@ -102,7 +104,7 @@ func parseTokenizer(d string, specialTypes []string) (*Tokenizer, error) { } } - for _, st := range specialTypes { + for _, st := range specialTokenTypes { sv := SpecialVocabulary{Type: st} if bts, ok := p[fmt.Sprintf("add_%s_token", st)]; ok { if err := json.Unmarshal(bts, &sv.AddToken); err != nil { @@ -224,14 +226,13 @@ func parseVocabulary(d string) (*Vocabulary, error) { } for pattern, parseFn := range patterns { - matches, err := filepath.Glob(filepath.Join(d, pattern)) - if err != nil { + if _, err := os.Stat(filepath.Join(d, pattern)); errors.Is(err, os.ErrNotExist) { + continue + } else if err != nil { return nil, err } - if len(matches) > 0 { - return parseFn(d) - } + return parseFn(d) } return nil, errors.New("unknown tensor format") diff --git a/llm/gguf.go b/llm/gguf.go index e61babf2..98158313 100644 --- a/llm/gguf.go +++ b/llm/gguf.go @@ -489,6 +489,7 @@ func readGGUFArray(llm *gguf, r io.Reader) (*array, error) { return a, nil } +// writeGGUFArray writes a slice s of type E to the write with a gguf type of t func writeGGUFArray[S ~[]E, E any](w io.Writer, t uint32, s S) error { if err := binary.Write(w, binary.LittleEndian, ggufTypeArray); err != nil { return err @@ -502,16 +503,10 @@ func writeGGUFArray[S ~[]E, E any](w io.Writer, t uint32, s S) error { return err } - for _, e := range s { - if err := binary.Write(w, binary.LittleEndian, e); err != nil { - return 
err - } - } - - return nil + return binary.Write(w, binary.LittleEndian, s) } -func WriteGGUF(ws io.WriteSeeker, kv KV, ts []*Tensor) error { +func WriteGGUF(ws io.WriteSeeker, kv KV, ts []Tensor) error { if err := binary.Write(ws, binary.LittleEndian, []byte("GGUF")); err != nil { return err } @@ -537,7 +532,7 @@ func WriteGGUF(ws io.WriteSeeker, kv KV, ts []*Tensor) error { } } - slices.SortFunc(ts, func(a, b *Tensor) int { + slices.SortFunc(ts, func(a, b Tensor) int { var i, j int if n, err := fmt.Sscanf(a.Name, "blk.%d", &i); err != nil || n != 1 { return cmp.Compare(a.Name, b.Name) @@ -622,7 +617,7 @@ func ggufWriteKV(ws io.WriteSeeker, k string, v any) error { return err } -func ggufWriteTensorInfo(ws io.WriteSeeker, t *Tensor) error { +func ggufWriteTensorInfo(ws io.WriteSeeker, t Tensor) error { slog.Debug(t.Name, "kind", t.Kind, "shape", t.Shape, "offset", t.Offset) if err := binary.Write(ws, binary.LittleEndian, uint64(len(t.Name))); err != nil { return err @@ -649,7 +644,7 @@ func ggufWriteTensorInfo(ws io.WriteSeeker, t *Tensor) error { return binary.Write(ws, binary.LittleEndian, t.Offset) } -func ggufWriteTensor(ws io.WriteSeeker, t *Tensor, alignment int64) error { +func ggufWriteTensor(ws io.WriteSeeker, t Tensor, alignment int64) error { offset, err := ws.Seek(0, io.SeekCurrent) if err != nil { return err diff --git a/llm/memory_test.go b/llm/memory_test.go index 18c797ee..3220c8df 100644 --- a/llm/memory_test.go +++ b/llm/memory_test.go @@ -21,7 +21,7 @@ func TestEstimateGPULayers(t *testing.T) { defer f.Close() inputLayerCount := 5 - tensors := []*Tensor{ + tensors := []Tensor{ {Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))}, {Name: "blk.1.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))}, {Name: "blk.2.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))}, diff --git a/server/routes_create_test.go b/server/routes_create_test.go index 4d616d8d..9b7009df 100644 --- a/server/routes_create_test.go +++ b/server/routes_create_test.go @@ -19,7 +19,7 @@ import ( var stream bool = false -func createBinFile(t *testing.T, kv map[string]any, ti []*llm.Tensor) string { +func createBinFile(t *testing.T, kv map[string]any, ti []llm.Tensor) string { t.Helper() f, err := os.CreateTemp(t.TempDir(), "") diff --git a/server/routes_generate_test.go b/server/routes_generate_test.go index 02f95dd2..5c0caff1 100644 --- a/server/routes_generate_test.go +++ b/server/routes_generate_test.go @@ -101,7 +101,7 @@ func TestGenerateChat(t *testing.T) { "tokenizer.ggml.tokens": []string{""}, "tokenizer.ggml.scores": []float32{0}, "tokenizer.ggml.token_type": []int32{0}, - }, []*llm.Tensor{ + }, []llm.Tensor{ {Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))}, {Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))}, {Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))}, @@ -149,7 +149,7 @@ func TestGenerateChat(t *testing.T) { Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, llm.KV{ "general.architecture": "bert", "bert.pooling_type": uint32(0), - }, []*llm.Tensor{})), + }, []llm.Tensor{})), Stream: &stream, }) @@ -399,7 +399,7 @@ func TestGenerate(t *testing.T) { "tokenizer.ggml.tokens": []string{""}, "tokenizer.ggml.scores": []float32{0}, 
"tokenizer.ggml.token_type": []int32{0}, - }, []*llm.Tensor{ + }, []llm.Tensor{ {Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))}, {Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))}, {Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))}, @@ -447,7 +447,7 @@ func TestGenerate(t *testing.T) { Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, llm.KV{ "general.architecture": "bert", "bert.pooling_type": uint32(0), - }, []*llm.Tensor{})), + }, []llm.Tensor{})), Stream: &stream, }) diff --git a/server/sched_test.go b/server/sched_test.go index f3c55514..80395714 100644 --- a/server/sched_test.go +++ b/server/sched_test.go @@ -124,7 +124,7 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est "tokenizer.ggml.tokens": []string{" "}, "tokenizer.ggml.scores": []float32{0}, "tokenizer.ggml.token_type": []int32{0}, - }, []*llm.Tensor{ + }, []llm.Tensor{ {Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))}, {Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))}, })) From 781fc2d5769bd1df7895dc2a18ab44830f6684fc Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Wed, 31 Jul 2024 10:58:22 -0700 Subject: [PATCH 65/79] Update convert/reader_safetensors.go Co-authored-by: Jeffrey Morgan --- convert/reader_safetensors.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/convert/reader_safetensors.go b/convert/reader_safetensors.go index d43c59a5..c5fe663c 100644 --- a/convert/reader_safetensors.go +++ b/convert/reader_safetensors.go @@ -67,8 +67,8 @@ func parseSafetensors(ps ...string) ([]Tensor, error) { } // safetensorsPad returns the padded size of the safetensors file given a length n and offset s -func safetensorsPad(n, s int64) int64 { - return 8 + n + s +func safetensorsPad(n, offset int64) int64 { + return 8 + n + offset } type safetensor struct { From eafc607abb3422a7d8e488aeb7a129a67a1f75c6 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Sat, 29 Jun 2024 16:53:59 -0700 Subject: [PATCH 66/79] convert: only extract large files --- convert/convert.go | 11 ++-- convert/convert_test.go | 7 ++- convert/fs.go | 58 +++++++++++++++++++ convert/reader.go | 10 ++-- convert/reader_safetensors.go | 20 +++++-- convert/reader_torch.go | 3 +- convert/tokenizer.go | 22 ++++---- convert/tokenizer_spm.go | 8 +-- server/model.go | 79 ++++++-------------------- server/model_test.go | 102 ---------------------------------- 10 files changed, 120 insertions(+), 200 deletions(-) create mode 100644 convert/fs.go diff --git a/convert/convert.go b/convert/convert.go index 30c5a53f..b9461e4f 100644 --- a/convert/convert.go +++ b/convert/convert.go @@ -5,9 +5,8 @@ import ( "errors" "fmt" "io" + "io/fs" "log/slog" - "os" - "path/filepath" "github.com/ollama/ollama/llm" ) @@ -67,8 +66,8 @@ type Converter interface { // and files it finds in the input path. // Supported input model formats include safetensors. // Supported input tokenizers files include tokenizer.json (preferred) and tokenizer.model. 
-func Convert(path string, ws io.WriteSeeker) error { - bts, err := os.ReadFile(filepath.Join(path, "config.json")) +func Convert(fsys fs.FS, ws io.WriteSeeker) error { + bts, err := fs.ReadFile(fsys, "config.json") if err != nil { return err } @@ -98,7 +97,7 @@ func Convert(path string, ws io.WriteSeeker) error { return err } - t, err := parseTokenizer(path, conv.specialTokenTypes()) + t, err := parseTokenizer(fsys, conv.specialTokenTypes()) if err != nil { return err } @@ -114,7 +113,7 @@ func Convert(path string, ws io.WriteSeeker) error { slog.Debug("vocabulary", "size", len(t.Vocabulary.Tokens)) } - ts, err := parseTensors(path) + ts, err := parseTensors(fsys) if err != nil { return err } diff --git a/convert/convert_test.go b/convert/convert_test.go index 0fbd436f..67a2fcfe 100644 --- a/convert/convert_test.go +++ b/convert/convert_test.go @@ -6,6 +6,7 @@ import ( "flag" "fmt" "io" + "io/fs" "log/slog" "math" "os" @@ -17,7 +18,7 @@ import ( "golang.org/x/exp/maps" ) -func convertFull(t *testing.T, d string) (*os.File, llm.KV, llm.Tensors) { +func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, llm.Tensors) { t.Helper() f, err := os.CreateTemp(t.TempDir(), "f16") @@ -26,7 +27,7 @@ func convertFull(t *testing.T, d string) (*os.File, llm.KV, llm.Tensors) { } defer f.Close() - if err := Convert(d, f); err != nil { + if err := Convert(fsys, f); err != nil { t.Fatal(err) } @@ -76,7 +77,7 @@ func TestConvertFull(t *testing.T) { t.Skipf("%s not found", p) } - f, kv, tensors := convertFull(t, p) + f, kv, tensors := convertFull(t, os.DirFS(p)) actual := make(map[string]string) for k, v := range kv { if s, ok := v.(json.Marshaler); !ok { diff --git a/convert/fs.go b/convert/fs.go new file mode 100644 index 00000000..bf6da6c2 --- /dev/null +++ b/convert/fs.go @@ -0,0 +1,58 @@ +package convert + +import ( + "archive/zip" + "errors" + "io" + "io/fs" + "os" + "path/filepath" +) + +type ZipReader struct { + r *zip.Reader + p string + + // limit is the maximum size of a file that can be read directly + // from the zip archive. 
Files larger than this size will be extracted + limit int64 +} + +func NewZipReader(r *zip.Reader, p string, limit int64) fs.FS { + return &ZipReader{r, p, limit} +} + +func (z *ZipReader) Open(name string) (fs.File, error) { + r, err := z.r.Open(name) + if err != nil { + return nil, err + } + defer r.Close() + + if fi, err := r.Stat(); err != nil { + return nil, err + } else if fi.Size() < z.limit { + return r, nil + } + + if !filepath.IsLocal(name) { + return nil, zip.ErrInsecurePath + } + + n := filepath.Join(z.p, name) + if _, err := os.Stat(n); errors.Is(err, os.ErrNotExist) { + w, err := os.Create(n) + if err != nil { + return nil, err + } + defer w.Close() + + if _, err := io.Copy(w, r); err != nil { + return nil, err + } + } else if err != nil { + return nil, err + } + + return os.Open(n) +} diff --git a/convert/reader.go b/convert/reader.go index 11ccaa81..56a8ae89 100644 --- a/convert/reader.go +++ b/convert/reader.go @@ -3,7 +3,7 @@ package convert import ( "errors" "io" - "path/filepath" + "io/fs" "strings" ) @@ -55,8 +55,8 @@ func (t *tensorBase) SetRepacker(fn repacker) { type repacker func(string, []float32, []uint64) ([]float32, error) -func parseTensors(d string) ([]Tensor, error) { - patterns := map[string]func(...string) ([]Tensor, error){ +func parseTensors(fsys fs.FS) ([]Tensor, error) { + patterns := map[string]func(fs.FS, ...string) ([]Tensor, error){ "model-*-of-*.safetensors": parseSafetensors, "model.safetensors": parseSafetensors, "pytorch_model-*-of-*.bin": parseTorch, @@ -65,13 +65,13 @@ func parseTensors(d string) ([]Tensor, error) { } for pattern, parseFn := range patterns { - matches, err := filepath.Glob(filepath.Join(d, pattern)) + matches, err := fs.Glob(fsys, pattern) if err != nil { return nil, err } if len(matches) > 0 { - return parseFn(matches...) + return parseFn(fsys, matches...) 
} } diff --git a/convert/reader_safetensors.go b/convert/reader_safetensors.go index c5fe663c..1c169504 100644 --- a/convert/reader_safetensors.go +++ b/convert/reader_safetensors.go @@ -6,7 +6,7 @@ import ( "encoding/json" "fmt" "io" - "os" + "io/fs" "slices" "github.com/d4l3k/go-bfloat16" @@ -20,10 +20,10 @@ type safetensorMetadata struct { Offsets []int64 `json:"data_offsets"` } -func parseSafetensors(ps ...string) ([]Tensor, error) { +func parseSafetensors(fsys fs.FS, ps ...string) ([]Tensor, error) { var ts []Tensor for _, p := range ps { - f, err := os.Open(p) + f, err := fsys.Open(p) if err != nil { return nil, err } @@ -50,6 +50,7 @@ func parseSafetensors(ps ...string) ([]Tensor, error) { for _, key := range keys { if value := headers[key]; value.Type != "" { ts = append(ts, safetensor{ + fs: fsys, path: p, dtype: value.Type, offset: safetensorsPad(n, value.Offsets[0]), @@ -72,6 +73,7 @@ func safetensorsPad(n, offset int64) int64 { } type safetensor struct { + fs fs.FS path string dtype string offset int64 @@ -80,14 +82,20 @@ type safetensor struct { } func (st safetensor) WriteTo(w io.Writer) (int64, error) { - f, err := os.Open(st.path) + f, err := st.fs.Open(st.path) if err != nil { return 0, err } defer f.Close() - if _, err = f.Seek(st.offset, io.SeekStart); err != nil { - return 0, err + if seeker, ok := f.(io.Seeker); ok { + if _, err := seeker.Seek(st.offset, io.SeekStart); err != nil { + return 0, err + } + } else { + if _, err := io.CopyN(io.Discard, f, st.offset); err != nil { + return 0, err + } } var f32s []float32 diff --git a/convert/reader_torch.go b/convert/reader_torch.go index 1428706e..531996bf 100644 --- a/convert/reader_torch.go +++ b/convert/reader_torch.go @@ -2,12 +2,13 @@ package convert import ( "io" + "io/fs" "github.com/nlpodyssey/gopickle/pytorch" "github.com/nlpodyssey/gopickle/types" ) -func parseTorch(ps ...string) ([]Tensor, error) { +func parseTorch(fsys fs.FS, ps ...string) ([]Tensor, error) { var ts []Tensor for _, p := range ps { pt, err := pytorch.Load(p) diff --git a/convert/tokenizer.go b/convert/tokenizer.go index 43d8c14e..cca40eb0 100644 --- a/convert/tokenizer.go +++ b/convert/tokenizer.go @@ -7,9 +7,9 @@ import ( "encoding/json" "errors" "fmt" + "io/fs" "log/slog" "os" - "path/filepath" "slices" ) @@ -32,8 +32,8 @@ type Tokenizer struct { Template string } -func parseTokenizer(d string, specialTokenTypes []string) (*Tokenizer, error) { - v, err := parseVocabulary(d) +func parseTokenizer(fsys fs.FS, specialTokenTypes []string) (*Tokenizer, error) { + v, err := parseVocabulary(fsys) if err != nil { return nil, err } @@ -44,7 +44,7 @@ func parseTokenizer(d string, specialTokenTypes []string) (*Tokenizer, error) { } addedTokens := make(map[string]token) - if f, err := os.Open(filepath.Join(d, "tokenizer.json")); errors.Is(err, os.ErrNotExist) { + if f, err := fsys.Open("tokenizer.json"); errors.Is(err, os.ErrNotExist) { } else if err != nil { return nil, err } else { @@ -87,7 +87,7 @@ func parseTokenizer(d string, specialTokenTypes []string) (*Tokenizer, error) { } } - if f, err := os.Open(filepath.Join(d, "tokenizer_config.json")); errors.Is(err, os.ErrNotExist) { + if f, err := fsys.Open("tokenizer_config.json"); errors.Is(err, os.ErrNotExist) { } else if err != nil { return nil, err } else { @@ -172,8 +172,8 @@ type Vocabulary struct { Types []int32 } -func parseVocabularyFromTokenizer(p string) (*Vocabulary, error) { - f, err := os.Open(filepath.Join(p, "tokenizer.json")) +func parseVocabularyFromTokenizer(fsys fs.FS) (*Vocabulary, 
error) { + f, err := fsys.Open("tokenizer.json") if err != nil { return nil, err } @@ -219,20 +219,20 @@ func parseVocabularyFromTokenizer(p string) (*Vocabulary, error) { return &v, nil } -func parseVocabulary(d string) (*Vocabulary, error) { - patterns := map[string]func(string) (*Vocabulary, error){ +func parseVocabulary(fsys fs.FS) (*Vocabulary, error) { + patterns := map[string]func(fs.FS) (*Vocabulary, error){ "tokenizer.model": parseSentencePiece, "tokenizer.json": parseVocabularyFromTokenizer, } for pattern, parseFn := range patterns { - if _, err := os.Stat(filepath.Join(d, pattern)); errors.Is(err, os.ErrNotExist) { + if _, err := fs.Stat(fsys, pattern); errors.Is(err, os.ErrNotExist) { continue } else if err != nil { return nil, err } - return parseFn(d) + return parseFn(fsys) } return nil, errors.New("unknown tensor format") diff --git a/convert/tokenizer_spm.go b/convert/tokenizer_spm.go index 75d9fe26..babf702c 100644 --- a/convert/tokenizer_spm.go +++ b/convert/tokenizer_spm.go @@ -5,8 +5,8 @@ import ( "encoding/json" "errors" "fmt" + "io/fs" "os" - "path/filepath" "slices" "google.golang.org/protobuf/proto" @@ -14,8 +14,8 @@ import ( "github.com/ollama/ollama/convert/sentencepiece" ) -func parseSentencePiece(d string) (*Vocabulary, error) { - bts, err := os.ReadFile(filepath.Join(d, "tokenizer.model")) +func parseSentencePiece(fsys fs.FS) (*Vocabulary, error) { + bts, err := fs.ReadFile(fsys, "tokenizer.model") if err != nil { return nil, err } @@ -41,7 +41,7 @@ func parseSentencePiece(d string) (*Vocabulary, error) { } } - f, err := os.Open(filepath.Join(d, "added_tokens.json")) + f, err := fsys.Open("added_tokens.json") if errors.Is(err, os.ErrNotExist) { return &v, nil } else if err != nil { diff --git a/server/model.go b/server/model.go index 81272a34..f2946a0b 100644 --- a/server/model.go +++ b/server/model.go @@ -81,88 +81,43 @@ func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressRe return layers, nil } -func extractFromZipFile(p string, file *os.File, fn func(api.ProgressResponse)) error { - stat, err := file.Stat() - if err != nil { - return err - } - - r, err := zip.NewReader(file, stat.Size()) - if err != nil { - return err - } - - fn(api.ProgressResponse{Status: "unpacking model metadata"}) - for _, f := range r.File { - if !filepath.IsLocal(f.Name) { - return fmt.Errorf("%w: %s", zip.ErrInsecurePath, f.Name) - } - - n := filepath.Join(p, f.Name) - if err := os.MkdirAll(filepath.Dir(n), 0o750); err != nil { - return err - } - - // TODO(mxyng): this should not write out all files to disk - outfile, err := os.Create(n) - if err != nil { - return err - } - defer outfile.Close() - - infile, err := f.Open() - if err != nil { - return err - } - defer infile.Close() - - if _, err = io.Copy(outfile, infile); err != nil { - return err - } - - if err := outfile.Close(); err != nil { - return err - } - - if err := infile.Close(); err != nil { - return err - } - } - - return nil -} - -func parseFromZipFile(_ context.Context, file *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) { - tempDir, err := os.MkdirTemp(filepath.Dir(file.Name()), "") +func parseFromZipFile(_ context.Context, f *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) { + fi, err := f.Stat() if err != nil { return nil, err } - defer os.RemoveAll(tempDir) - if err := extractFromZipFile(tempDir, file, fn); err != nil { + r, err := zip.NewReader(f, fi.Size()) + if err != nil { return nil, err } + p, err := 
os.MkdirTemp(filepath.Dir(f.Name()), "") + if err != nil { + return nil, err + } + defer os.RemoveAll(p) + fn(api.ProgressResponse{Status: "converting model"}) - // TODO(mxyng): this should write directly into a layer // e.g. NewLayer(arch.Reader(), "application/vnd.ollama.image.model") - temp, err := os.CreateTemp(tempDir, "fp16") + t, err := os.CreateTemp(p, "fp16") if err != nil { return nil, err } - defer temp.Close() - defer os.Remove(temp.Name()) + defer t.Close() + defer os.Remove(t.Name()) - if err := convert.Convert(tempDir, temp); err != nil { + fn(api.ProgressResponse{Status: "converting model"}) + if err := convert.Convert(convert.NewZipReader(r, p, 32<<20), t); err != nil { return nil, err } - if _, err := temp.Seek(0, io.SeekStart); err != nil { + if _, err := t.Seek(0, io.SeekStart); err != nil { return nil, err } - layer, err := NewLayer(temp, "application/vnd.ollama.image.model") + layer, err := NewLayer(t, "application/vnd.ollama.image.model") if err != nil { return nil, err } diff --git a/server/model_test.go b/server/model_test.go index 5829adfc..0a2225d5 100644 --- a/server/model_test.go +++ b/server/model_test.go @@ -1,16 +1,11 @@ package server import ( - "archive/zip" "bytes" "encoding/json" - "errors" "fmt" - "io" "os" "path/filepath" - "slices" - "strings" "testing" "github.com/google/go-cmp/cmp" @@ -18,103 +13,6 @@ import ( "github.com/ollama/ollama/template" ) -func createZipFile(t *testing.T, name string) *os.File { - t.Helper() - - f, err := os.CreateTemp(t.TempDir(), "") - if err != nil { - t.Fatal(err) - } - - zf := zip.NewWriter(f) - defer zf.Close() - - zh, err := zf.CreateHeader(&zip.FileHeader{Name: name}) - if err != nil { - t.Fatal(err) - } - - if _, err := io.Copy(zh, bytes.NewReader([]byte(""))); err != nil { - t.Fatal(err) - } - - return f -} - -func TestExtractFromZipFile(t *testing.T) { - cases := []struct { - name string - expect []string - err error - }{ - { - name: "good", - expect: []string{"good"}, - }, - { - name: strings.Join([]string{"path", "..", "to", "good"}, string(os.PathSeparator)), - expect: []string{filepath.Join("to", "good")}, - }, - { - name: strings.Join([]string{"path", "..", "to", "..", "good"}, string(os.PathSeparator)), - expect: []string{"good"}, - }, - { - name: strings.Join([]string{"path", "to", "..", "..", "good"}, string(os.PathSeparator)), - expect: []string{"good"}, - }, - { - name: strings.Join([]string{"..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "bad"}, string(os.PathSeparator)), - err: zip.ErrInsecurePath, - }, - { - name: strings.Join([]string{"path", "..", "..", "to", "bad"}, string(os.PathSeparator)), - err: zip.ErrInsecurePath, - }, - } - - for _, tt := range cases { - t.Run(tt.name, func(t *testing.T) { - f := createZipFile(t, tt.name) - defer f.Close() - - tempDir := t.TempDir() - if err := extractFromZipFile(tempDir, f, func(api.ProgressResponse) {}); !errors.Is(err, tt.err) { - t.Fatal(err) - } - - var matches []string - if err := filepath.Walk(tempDir, func(p string, fi os.FileInfo, err error) error { - if err != nil { - return err - } - - if !fi.IsDir() { - matches = append(matches, p) - } - - return nil - }); err != nil { - t.Fatal(err) - } - - var actual []string - for _, match := range matches { - rel, err := filepath.Rel(tempDir, match) - if err != nil { - t.Error(err) - } - - actual = append(actual, rel) - } - - if !slices.Equal(actual, tt.expect) { - t.Fatalf("expected %d files, got %d", len(tt.expect), len(matches)) - } - }) - } -} - func 
readFile(t *testing.T, base, name string) *bytes.Buffer { t.Helper() From d8e2664c33e81af0549aa9e75c57e08317d0322d Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Wed, 31 Jul 2024 15:39:11 -0700 Subject: [PATCH 67/79] convert: fix parse functions --- convert/reader.go | 21 ++++++++++++--------- convert/tokenizer.go | 15 +++++++++------ 2 files changed, 21 insertions(+), 15 deletions(-) diff --git a/convert/reader.go b/convert/reader.go index 56a8ae89..ce95208e 100644 --- a/convert/reader.go +++ b/convert/reader.go @@ -56,22 +56,25 @@ func (t *tensorBase) SetRepacker(fn repacker) { type repacker func(string, []float32, []uint64) ([]float32, error) func parseTensors(fsys fs.FS) ([]Tensor, error) { - patterns := map[string]func(fs.FS, ...string) ([]Tensor, error){ - "model-*-of-*.safetensors": parseSafetensors, - "model.safetensors": parseSafetensors, - "pytorch_model-*-of-*.bin": parseTorch, - "pytorch_model.bin": parseTorch, - "consolidated.*.pth": parseTorch, + patterns := []struct { + Pattern string + Func func(fs.FS, ...string) ([]Tensor, error) + }{ + {"model-*-of-*.safetensors", parseSafetensors}, + {"model.safetensors", parseSafetensors}, + {"pytorch_model-*-of-*.bin", parseTorch}, + {"pytorch_model.bin", parseTorch}, + {"consolidated.*.pth", parseTorch}, } - for pattern, parseFn := range patterns { - matches, err := fs.Glob(fsys, pattern) + for _, pattern := range patterns { + matches, err := fs.Glob(fsys, pattern.Pattern) if err != nil { return nil, err } if len(matches) > 0 { - return parseFn(fsys, matches...) + return pattern.Func(fsys, matches...) } } diff --git a/convert/tokenizer.go b/convert/tokenizer.go index cca40eb0..0d42a6d8 100644 --- a/convert/tokenizer.go +++ b/convert/tokenizer.go @@ -220,19 +220,22 @@ func parseVocabularyFromTokenizer(fsys fs.FS) (*Vocabulary, error) { } func parseVocabulary(fsys fs.FS) (*Vocabulary, error) { - patterns := map[string]func(fs.FS) (*Vocabulary, error){ - "tokenizer.model": parseSentencePiece, - "tokenizer.json": parseVocabularyFromTokenizer, + patterns := []struct { + Pattern string + Func func(fs.FS) (*Vocabulary, error) + }{ + {"tokenizer.model", parseSentencePiece}, + {"tokenizer.json", parseVocabularyFromTokenizer}, } - for pattern, parseFn := range patterns { - if _, err := fs.Stat(fsys, pattern); errors.Is(err, os.ErrNotExist) { + for _, pattern := range patterns { + if _, err := fs.Stat(fsys, pattern.Pattern); errors.Is(err, os.ErrNotExist) { continue } else if err != nil { return nil, err } - return parseFn(fsys) + return pattern.Func(fsys) } return nil, errors.New("unknown tensor format") From dc77bbcfa40dea8b8fc7713a2ecacbc6a9d08a25 Mon Sep 17 00:00:00 2001 From: Blake Mizerany Date: Wed, 31 Jul 2024 16:01:24 -0700 Subject: [PATCH 68/79] server: fix json marshalling of downloadBlobPart (#6108) --- server/download.go | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/server/download.go b/server/download.go index 45483ba6..10074554 100644 --- a/server/download.go +++ b/server/download.go @@ -61,6 +61,36 @@ type blobDownloadPart struct { *blobDownload `json:"-"` } +type jsonBlobDownloadPart struct { + N int + Offset int64 + Size int64 + Completed int64 +} + +func (p *blobDownloadPart) MarshalJSON() ([]byte, error) { + return json.Marshal(jsonBlobDownloadPart{ + N: p.N, + Offset: p.Offset, + Size: p.Size, + Completed: p.Completed.Load(), + }) +} + +func (p *blobDownloadPart) UnmarshalJSON(b []byte) error { + var j jsonBlobDownloadPart + if err := json.Unmarshal(b, &j); err != nil { + return err + } 
+ *p = blobDownloadPart{ + N: j.N, + Offset: j.Offset, + Size: j.Size, + } + p.Completed.Store(j.Completed) + return nil +} + const ( numDownloadParts = 64 minDownloadPartSize int64 = 100 * format.MegaByte From d87b4a488eaf6e52bc0ba170a803cbf4d63921cd Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Wed, 31 Jul 2024 16:52:09 -0700 Subject: [PATCH 69/79] fix modelfile message quotes --- server/images.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/images.go b/server/images.go index 24675783..5f3eee88 100644 --- a/server/images.go +++ b/server/images.go @@ -184,7 +184,7 @@ func (m *Model) String() string { for _, msg := range m.Messages { modelfile.Commands = append(modelfile.Commands, parser.Command{ Name: "message", - Args: fmt.Sprintf("%s %s", msg.Role, msg.Content), + Args: fmt.Sprintf("%s: %s", msg.Role, msg.Content), }) } From 6bc5c137581cdd825627e7ec3da308843c94e162 Mon Sep 17 00:00:00 2001 From: Vyacheslav Moskalev Date: Thu, 1 Aug 2024 15:45:41 +0700 Subject: [PATCH 70/79] Fix extra context concatenation in generate handler (#5980). --- server/routes.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/routes.go b/server/routes.go index fb9b3091..b449136e 100644 --- a/server/routes.go +++ b/server/routes.go @@ -247,7 +247,7 @@ func (s *Server) GenerateHandler(c *gin.Context) { ch <- gin.H{"error": err.Error()} return } - res.Context = append(req.Context, tokens...) + res.Context = tokens } } From 49a54831397d1723af4fbcd4f0c5c68dadbc54d5 Mon Sep 17 00:00:00 2001 From: Vyacheslav Moskalev Date: Thu, 1 Aug 2024 19:25:56 +0700 Subject: [PATCH 71/79] Change the order of context and prompt. --- server/routes.go | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/server/routes.go b/server/routes.go index b449136e..65ba22ea 100644 --- a/server/routes.go +++ b/server/routes.go @@ -188,21 +188,22 @@ func (s *Server) GenerateHandler(c *gin.Context) { } var b bytes.Buffer - if err := tmpl.Execute(&b, values); err != nil { + var t bytes.Buffer + if err := tmpl.Execute(&t, values); err != nil { c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) return } if req.Context != nil { - s, err := r.Detokenize(c.Request.Context(), req.Context) + prev, err := r.Detokenize(c.Request.Context(), req.Context) if err != nil { c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) return } - - b.WriteString(s) + b.WriteString(prev) } + b.WriteString(t.String()); prompt = b.String() } @@ -242,12 +243,12 @@ func (s *Server) GenerateHandler(c *gin.Context) { res.LoadDuration = checkpointLoaded.Sub(checkpointStart) if !req.Raw { - tokens, err := r.Tokenize(c.Request.Context(), prompt+sb.String()) + tokens, err := r.Tokenize(c.Request.Context(), prompt + sb.String()) if err != nil { ch <- gin.H{"error": err.Error()} return } - res.Context = tokens + res.Context = tokens[:] } } From b0c216584c82b47fa91323468a2c58e79f96f0bb Mon Sep 17 00:00:00 2001 From: Vyacheslav Moskalev Date: Thu, 1 Aug 2024 19:43:44 +0700 Subject: [PATCH 72/79] Better types and naming closer to style. 
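The handler changes in this and the surrounding patches all touch the same
flow: the `context` tokens returned by the previous /api/generate call are
detokenized and placed ahead of the newly rendered prompt, and the new
response's `context` is the tokenization of that prompt plus the generated
output. A rough, self-contained sketch of the round trip (stubbed tokenizer
and invented names, not the handler code):

    package main

    import (
        "fmt"
        "strings"
    )

    // stand-ins for the runner's Tokenize/Detokenize calls
    func tokenize(s string) []string   { return strings.Fields(s) }
    func detokenize(t []string) string { return strings.Join(t, " ") }

    func main() {
        prev := tokenize("User: hi Assistant: hello") // req.Context from the last turn
        rendered := "User: why is the sky blue?"      // template output for the new turn

        // previous context first, then the freshly rendered prompt
        prompt := detokenize(prev) + " " + rendered
        response := "Because of Rayleigh scattering."

        // returned as res.Context for the next request
        next := tokenize(prompt + " " + response)
        fmt.Println(prompt)
        fmt.Println(len(next), "tokens carried forward")
    }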
--- server/routes.go | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/server/routes.go b/server/routes.go index 65ba22ea..0d397aaa 100644 --- a/server/routes.go +++ b/server/routes.go @@ -187,9 +187,9 @@ func (s *Server) GenerateHandler(c *gin.Context) { values.Messages = append(msgs, api.Message{Role: "user", Content: req.Prompt}) } + var s string var b bytes.Buffer - var t bytes.Buffer - if err := tmpl.Execute(&t, values); err != nil { + if err := tmpl.Execute(&b, values); err != nil { c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) return } @@ -200,11 +200,11 @@ func (s *Server) GenerateHandler(c *gin.Context) { c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) return } - b.WriteString(prev) + s += prev } - b.WriteString(t.String()); - prompt = b.String() + s += b.String(); + prompt = s } slog.Debug("generate request", "prompt", prompt, "images", images) From 3b5210548e957c5011233ae0e114131413362188 Mon Sep 17 00:00:00 2001 From: Vyacheslav Moskalev Date: Thu, 1 Aug 2024 19:56:15 +0700 Subject: [PATCH 73/79] Refactor code. Remove extra variable. --- server/routes.go | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/server/routes.go b/server/routes.go index 0d397aaa..8184db75 100644 --- a/server/routes.go +++ b/server/routes.go @@ -187,7 +187,6 @@ func (s *Server) GenerateHandler(c *gin.Context) { values.Messages = append(msgs, api.Message{Role: "user", Content: req.Prompt}) } - var s string var b bytes.Buffer if err := tmpl.Execute(&b, values); err != nil { c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) @@ -195,16 +194,15 @@ func (s *Server) GenerateHandler(c *gin.Context) { } if req.Context != nil { - prev, err := r.Detokenize(c.Request.Context(), req.Context) + s, err := r.Detokenize(c.Request.Context(), req.Context) if err != nil { c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) return } - s += prev + prompt = s + b.String() + } else { + prompt = b.String(); } - - s += b.String(); - prompt = s } slog.Debug("generate request", "prompt", prompt, "images", images) From 8a9f946ca76d04cfc964a3ffe5b919c0fb51915b Mon Sep 17 00:00:00 2001 From: Vyacheslav Moskalev Date: Fri, 2 Aug 2024 03:50:05 +0700 Subject: [PATCH 74/79] Refactor and format code. 
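After this refactor the prompt is assembled in a single bytes.Buffer: the
detokenized previous context is written first, then the template is executed
into the same buffer. A minimal runnable sketch of that ordering (placeholder
template and strings, not the actual handler):

    package main

    import (
        "bytes"
        "fmt"
        "text/template"
    )

    func main() {
        var b bytes.Buffer

        // stands in for r.Detokenize(c.Request.Context(), req.Context)
        b.WriteString("Earlier conversation. ")

        // stands in for executing the model's prompt template with the request values
        tmpl := template.Must(template.New("prompt").Parse("User: {{.Prompt}}"))
        if err := tmpl.Execute(&b, struct{ Prompt string }{"why is the sky blue?"}); err != nil {
            panic(err)
        }

        fmt.Println(b.String()) // Earlier conversation. User: why is the sky blue?
    }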
--- server/routes.go | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/server/routes.go b/server/routes.go index 8184db75..a745fb20 100644 --- a/server/routes.go +++ b/server/routes.go @@ -188,21 +188,21 @@ func (s *Server) GenerateHandler(c *gin.Context) { } var b bytes.Buffer - if err := tmpl.Execute(&b, values); err != nil { - c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) - return - } - if req.Context != nil { s, err := r.Detokenize(c.Request.Context(), req.Context) if err != nil { c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) return } - prompt = s + b.String() - } else { - prompt = b.String(); + b.WriteString(s) } + + if err := tmpl.Execute(&b, values); err != nil { + c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) + return + } + + prompt = b.String() } slog.Debug("generate request", "prompt", prompt, "images", images) @@ -241,12 +241,12 @@ func (s *Server) GenerateHandler(c *gin.Context) { res.LoadDuration = checkpointLoaded.Sub(checkpointStart) if !req.Raw { - tokens, err := r.Tokenize(c.Request.Context(), prompt + sb.String()) + tokens, err := r.Tokenize(c.Request.Context(), prompt+sb.String()) if err != nil { ch <- gin.H{"error": err.Error()} return } - res.Context = tokens[:] + res.Context = tokens } } From f561eecfb864536058ee73cdaca93de2a5c8dc5d Mon Sep 17 00:00:00 2001 From: royjhan <65097070+royjhan@users.noreply.github.com> Date: Thu, 1 Aug 2024 18:48:44 -0400 Subject: [PATCH 75/79] Update OpenAI Compatibility Docs with /v1/models (#5151) * OpenAI Docs * Update docs/openai.md Co-authored-by: Jeffrey Morgan * Remove newline --------- Co-authored-by: Jeffrey Morgan --- docs/openai.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/docs/openai.md b/docs/openai.md index fee30f71..701fbcdd 100644 --- a/docs/openai.md +++ b/docs/openai.md @@ -27,6 +27,8 @@ chat_completion = client.chat.completions.create( ], model='llama3', ) + +list_completion = client.models.list() ``` ### OpenAI JavaScript library @@ -45,6 +47,8 @@ const chatCompletion = await openai.chat.completions.create({ messages: [{ role: 'user', content: 'Say this is a test' }], model: 'llama3', }) + +const listCompletion = await openai.models.list() ``` ### `curl` @@ -66,6 +70,7 @@ curl http://localhost:11434/v1/chat/completions \ ] }' +curl http://localhost:11434/v1/models ``` ## Endpoints @@ -103,6 +108,13 @@ curl http://localhost:11434/v1/chat/completions \ - [ ] `user` - [ ] `n` +### `/v1/models` + +#### Notes + +- `created` corresponds to when the model was last modified +- `owned_by` corresponds to the ollama username, defaulting to `"library"` + ## Models Before using a model, pull it locally `ollama pull`: From 6f133a0bdd1a768d7936f6bbc40d11af732eee6f Mon Sep 17 00:00:00 2001 From: royjhan <65097070+royjhan@users.noreply.github.com> Date: Thu, 1 Aug 2024 18:49:37 -0400 Subject: [PATCH 76/79] OpenAI: Add Usage to `v1/embeddings` (#5886) * add prompt tokens to embed response * rm slog * metrics * types * prompt n * clean up * reset submodule * add tokens to v1/embeddings * separate usage --- openai/openai.go | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/openai/openai.go b/openai/openai.go index 5bd80660..e66d9416 100644 --- a/openai/openai.go +++ b/openai/openai.go @@ -164,9 +164,15 @@ type ListCompletion struct { } type EmbeddingList struct { - Object string `json:"object"` - Data []Embedding `json:"data"` - Model string `json:"model"` + Object string 
`json:"object"` + Data []Embedding `json:"data"` + Model string `json:"model"` + Usage EmbeddingUsage `json:"usage,omitempty"` +} + +type EmbeddingUsage struct { + PromptTokens int `json:"prompt_tokens"` + TotalTokens int `json:"total_tokens"` } func NewError(code int, message string) ErrorResponse { @@ -332,6 +338,10 @@ func toEmbeddingList(model string, r api.EmbedResponse) EmbeddingList { Object: "list", Data: data, Model: model, + Usage: EmbeddingUsage{ + PromptTokens: r.PromptEvalCount, + TotalTokens: r.PromptEvalCount, + }, } } From ed52833bb129c15fb499ced542889a23a0c6d74e Mon Sep 17 00:00:00 2001 From: royjhan <65097070+royjhan@users.noreply.github.com> Date: Thu, 1 Aug 2024 18:58:13 -0400 Subject: [PATCH 77/79] Add to docs (#5309) --- docs/openai.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/docs/openai.md b/docs/openai.md index 701fbcdd..e4a4af1e 100644 --- a/docs/openai.md +++ b/docs/openai.md @@ -29,6 +29,8 @@ chat_completion = client.chat.completions.create( ) list_completion = client.models.list() + +model = client.models.retrieve("llama3") ``` ### OpenAI JavaScript library @@ -49,6 +51,8 @@ const chatCompletion = await openai.chat.completions.create({ }) const listCompletion = await openai.models.list() + +const model = await openai.models.retrieve("llama3"); ``` ### `curl` @@ -71,6 +75,8 @@ curl http://localhost:11434/v1/chat/completions \ }' curl http://localhost:11434/v1/models + +curl https://api.openai.com/v1/models/llama3 ``` ## Endpoints @@ -115,6 +121,13 @@ curl http://localhost:11434/v1/models - `created` corresponds to when the model was last modified - `owned_by` corresponds to the ollama username, defaulting to `"library"` +### `/v1/models/{model}` + +#### Notes + +- `created` corresponds to when the model was last modified +- `owned_by` corresponds to the ollama username, defaulting to `"library"` + ## Models Before using a model, pull it locally `ollama pull`: From 558a54b098dc5044f5c4167ede5c327c970185a2 Mon Sep 17 00:00:00 2001 From: royjhan <65097070+royjhan@users.noreply.github.com> Date: Thu, 1 Aug 2024 19:00:29 -0400 Subject: [PATCH 78/79] Update OpenAI Compatibility Docs with /v1/embeddings (#5470) * docs without usage * no usage * rm metric note --- docs/openai.md | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/docs/openai.md b/docs/openai.md index e4a4af1e..29195329 100644 --- a/docs/openai.md +++ b/docs/openai.md @@ -31,6 +31,11 @@ chat_completion = client.chat.completions.create( list_completion = client.models.list() model = client.models.retrieve("llama3") + +embeddings = client.embeddings.create( + model="all-minilm", + input=["why is the sky blue?", "why is the grass green?"] +) ``` ### OpenAI JavaScript library @@ -53,6 +58,11 @@ const chatCompletion = await openai.chat.completions.create({ const listCompletion = await openai.models.list() const model = await openai.models.retrieve("llama3"); + +const embedding = await openai.embeddings.create({ + model: "all-minilm", + input: ["why is the sky blue?", "why is the grass green?"], +}); ``` ### `curl` @@ -77,6 +87,13 @@ curl http://localhost:11434/v1/chat/completions \ curl http://localhost:11434/v1/models curl https://api.openai.com/v1/models/llama3 + +curl http://localhost:11434/v1/embeddings \ + -H "Content-Type: application/json" \ + -d '{ + "model": "all-minilm", + "input": ["why is the sky blue?", "why is the grass green?"] + }' ``` ## Endpoints @@ -128,6 +145,20 @@ curl https://api.openai.com/v1/models/llama3 - `created` corresponds to 
when the model was last modified - `owned_by` corresponds to the ollama username, defaulting to `"library"` +### `/v1/embeddings` + +#### Supported request fields + +- [x] `model` +- [x] `input` + - [x] string + - [x] array of strings + - [ ] array of tokens + - [ ] array of token arrays +- [ ] `encoding format` +- [ ] `dimensions` +- [ ] `user` + ## Models Before using a model, pull it locally `ollama pull`: From ce1fb4447efc9958dcf279f7eb2ae6941bec1220 Mon Sep 17 00:00:00 2001 From: Kim Hallberg Date: Fri, 2 Aug 2024 01:31:47 +0200 Subject: [PATCH 79/79] Fix models/{model} URL (#6132) --- docs/openai.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/openai.md b/docs/openai.md index 29195329..b4443cb0 100644 --- a/docs/openai.md +++ b/docs/openai.md @@ -86,7 +86,7 @@ curl http://localhost:11434/v1/chat/completions \ curl http://localhost:11434/v1/models -curl https://api.openai.com/v1/models/llama3 +curl http://localhost:11434/v1/models/llama3 curl http://localhost:11434/v1/embeddings \ -H "Content-Type: application/json" \