Merge branch 'ollama:main' into cors

2024-09-05 13:12:20 +02:00 · 2024-09-05 13:12:20 +02:00 · ec486370fa
commit ec486370fa
parent 6fdf8235dd cf48603943
83 changed files with 1652 additions and 25416 deletions
--- a/.golangci.yaml
+++ b/.golangci.yaml
@ -32,6 +32,10 @@ linters:
 linters-settings:
  gci:
    sections: [standard, default, localmodule]
+  staticcheck:
+    checks:
+      - all
+      - -SA1019 # omit Deprecated check
 severity:
  default-severity: error
  rules:
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -18,7 +18,7 @@ See the [development documentation](./docs/development.md) for instructions on h

 * New features: new features (e.g. API fields, environment variables) add surface area to Ollama and make it harder to maintain in the long run as they cannot be removed without potentially breaking users in the future.
 * Refactoring: large code improvements are important, but can be harder or take longer to review and merge.
-* Documentation: small updates to fill in or dorrect missing documentation is helpful, however large documentation additions can be hard to maintain over time.
+* Documentation: small updates to fill in or correct missing documentation is helpful, however large documentation additions can be hard to maintain over time.

 ### Issues that may not be accepted

--- a/Dockerfile
+++ b/Dockerfile
@ -21,7 +21,7 @@ COPY --from=llm-code / /go/src/github.com/ollama/ollama/
 WORKDIR /go/src/github.com/ollama/ollama/llm/generate
 ARG CGO_CFLAGS
 ARG CUDA_V11_ARCHITECTURES
-ENV GOARCH amd64 
+ENV GOARCH amd64
 RUN --mount=type=cache,target=/root/.ccache \
    OLLAMA_SKIP_STATIC_GENERATE=1 \
    OLLAMA_SKIP_CPU_GENERATE=1 \
@ -38,7 +38,7 @@ COPY --from=llm-code / /go/src/github.com/ollama/ollama/
 WORKDIR /go/src/github.com/ollama/ollama/llm/generate
 ARG CGO_CFLAGS
 ARG CUDA_V12_ARCHITECTURES
-ENV GOARCH amd64 
+ENV GOARCH amd64
 RUN --mount=type=cache,target=/root/.ccache \
    OLLAMA_SKIP_STATIC_GENERATE=1 \
    OLLAMA_SKIP_CPU_GENERATE=1 \
@ -56,7 +56,7 @@ COPY --from=llm-code / /go/src/github.com/ollama/ollama/
 WORKDIR /go/src/github.com/ollama/ollama/llm/generate
 ARG CGO_CFLAGS
 ARG CUDA_V11_ARCHITECTURES
-ENV GOARCH arm64 
+ENV GOARCH arm64
 RUN OLLAMA_SKIP_STATIC_GENERATE=1 \
    OLLAMA_SKIP_CPU_GENERATE=1 \
    CMAKE_CUDA_ARCHITECTURES="${CUDA_V11_ARCHITECTURES}" \
@ -72,7 +72,7 @@ COPY --from=llm-code / /go/src/github.com/ollama/ollama/
 WORKDIR /go/src/github.com/ollama/ollama/llm/generate
 ARG CGO_CFLAGS
 ARG CUDA_V12_ARCHITECTURES
-ENV GOARCH arm64 
+ENV GOARCH arm64
 RUN --mount=type=cache,target=/root/.ccache \
    OLLAMA_SKIP_STATIC_GENERATE=1 \
    OLLAMA_SKIP_CPU_GENERATE=1 \
@ -92,7 +92,7 @@ COPY --from=llm-code / /go/src/github.com/ollama/ollama/
 WORKDIR /go/src/github.com/ollama/ollama/llm/generate
 ARG CGO_CFLAGS
 ARG AMDGPU_TARGETS
-ENV GOARCH amd64 
+ENV GOARCH amd64
 RUN --mount=type=cache,target=/root/.ccache \
    OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh
 RUN mkdir -p ../../dist/linux-amd64-rocm/lib/ollama && \
@ -107,7 +107,7 @@ ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
 COPY --from=llm-code / /go/src/github.com/ollama/ollama/
 ARG OLLAMA_CUSTOM_CPU_DEFS
 ARG CGO_CFLAGS
-ENV GOARCH amd64 
+ENV GOARCH amd64
 WORKDIR /go/src/github.com/ollama/ollama/llm/generate

 FROM --platform=linux/amd64 cpu-builder-amd64 AS static-build-amd64
@ -181,17 +181,19 @@ RUN --mount=type=cache,target=/root/.ccache \
 # Strip out ROCm dependencies to keep the primary image lean
 FROM --platform=linux/amd64 ubuntu:22.04 as amd64-libs-without-rocm
 COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /scratch/
-RUN cd /scratch/ollama/ && rm -rf rocblas libamd* libdrm* libroc* libhip* libhsa* 
+RUN cd /scratch/ollama/ && rm -rf rocblas libamd* libdrm* libroc* libhip* libhsa*

 # Runtime stages
 FROM --platform=linux/amd64 ubuntu:22.04 as runtime-amd64
 COPY --from=amd64-libs-without-rocm /scratch/ /lib/
-RUN apt-get update && apt-get install -y ca-certificates
+RUN apt-get update && apt-get install -y ca-certificates && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
 COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/

 FROM --platform=linux/arm64 ubuntu:22.04 as runtime-arm64
 COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
-RUN apt-get update && apt-get install -y ca-certificates
+RUN apt-get update && apt-get install -y ca-certificates && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
 COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/bin/ /bin/

 # Radeon images are much larger so we keep it distinct from the CPU/CUDA image
--- a/README.md
+++ b/README.md
@ -296,12 +296,20 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [OllamaSpring](https://github.com/CrazyNeil/OllamaSpring) (Ollama Client for macOS)
 - [LLocal.in](https://github.com/kartikm7/llocal) (Easy to use Electron Desktop Client for Ollama)
 - [Ollama with Google Mesop](https://github.com/rapidarchitect/ollama_mesop/) (Mesop Chat Client implementation with Ollama)
+- [Painting Droid](https://github.com/mateuszmigas/painting-droid) (Painting app with AI integrations)
 - [Kerlig AI](https://www.kerlig.com/) (AI writing assistant for macOS)
 - [AI Studio](https://github.com/MindWorkAI/AI-Studio)
 - [Sidellama](https://github.com/gyopak/sidellama) (browser-based LLM client)
 - [LLMStack](https://github.com/trypromptly/LLMStack) (No-code multi-agent framework to build LLM agents and workflows)
 - [BoltAI for Mac](https://boltai.com) (AI Chat Client for Mac)
 - [Harbor](https://github.com/av/harbor) (Containerized LLM Toolkit with Ollama as default backend)
+- [Go-CREW](https://www.jonathanhecl.com/go-crew/) (Powerful Offline RAG in Golang)
+- [PartCAD](https://github.com/openvmp/partcad/) (CAD model generation with OpenSCAD and CadQuery)
+- [Ollama4j Web UI](https://github.com/ollama4j/ollama4j-web-ui) - Java-based Web UI for Ollama built with Vaadin, Spring Boot and Ollama4j
+- [PyOllaMx](https://github.com/kspviswa/pyOllaMx) - macOS application capable of chatting with both Ollama and Apple MLX models.
+- [Claude Dev](https://github.com/saoudrizwan/claude-dev) - VSCode extension for multi-file/whole-repo coding
+- [Cherry Studio](https://github.com/kangfenmao/cherry-studio) (Desktop client with Ollama support)
+- [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy focused LLM chat interface with optional encryption)

 ### Terminal

@ -327,6 +335,9 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [gollama](https://github.com/sammcj/gollama)
 - [Ollama eBook Summary](https://github.com/cognitivetech/ollama-ebook-summary/)

+### Apple Vision Pro
+- [Enchanted](https://github.com/AugustDev/enchanted)
+
 ### Database

 - [MindsDB](https://github.com/mindsdb/mindsdb/blob/staging/mindsdb/integrations/handlers/ollama_handler/README.md) (Connects Ollama models with nearly 200 data platforms and apps)
@ -337,6 +348,8 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Pacman](https://archlinux.org/packages/extra/x86_64/ollama/)
 - [Helm Chart](https://artifacthub.io/packages/helm/ollama-helm/ollama)
 - [Guix channel](https://codeberg.org/tusharhero/ollama-guix)
+- [Nix package](https://search.nixos.org/packages?channel=24.05&show=ollama&from=0&size=50&sort=relevance&type=packages&query=ollama)
+- [Flox](https://flox.dev/blog/ollama-part-one)

 ### Libraries

@ -347,11 +360,12 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [LangChainRust](https://github.com/Abraxas-365/langchain-rust) with [example](https://github.com/Abraxas-365/langchain-rust/blob/main/examples/llm_ollama.rs)
 - [LlamaIndex](https://gpt-index.readthedocs.io/en/stable/examples/llm/ollama.html)
 - [LiteLLM](https://github.com/BerriAI/litellm)
+- [OllamaFarm for Go](https://github.com/presbrey/ollamafarm)
 - [OllamaSharp for .NET](https://github.com/awaescher/OllamaSharp)
 - [Ollama for Ruby](https://github.com/gbaptista/ollama-ai)
 - [Ollama-rs for Rust](https://github.com/pepperoni21/ollama-rs)
 - [Ollama-hpp for C++](https://github.com/jmont-dev/ollama-hpp)
- [Ollama4j for Java](https://github.com/amithkoujalgi/ollama4j)
+- [Ollama4j for Java](https://github.com/ollama4j/ollama4j)
 - [ModelFusion Typescript Library](https://modelfusion.dev/integration/model-provider/ollama)
 - [OllamaKit for Swift](https://github.com/kevinhermawan/OllamaKit)
 - [Ollama for Dart](https://github.com/breitburg/dart-ollama)
@ -368,11 +382,16 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Portkey](https://portkey.ai/docs/welcome/integration-guides/ollama)
 - [PromptingTools.jl](https://github.com/svilupp/PromptingTools.jl) with an [example](https://svilupp.github.io/PromptingTools.jl/dev/examples/working_with_ollama)
 - [LlamaScript](https://github.com/Project-Llama/llamascript)
+- [Gollm](https://docs.gollm.co/examples/ollama-example)
+- [Ollamaclient for Golang](https://github.com/xyproto/ollamaclient)
+- [High-level function abstraction in Go](https://gitlab.com/tozd/go/fun)
+- [Ollama PHP](https://github.com/ArdaGnsrn/ollama-php)

 ### Mobile

 - [Enchanted](https://github.com/AugustDev/enchanted)
 - [Maid](https://github.com/Mobile-Artificial-Intelligence/maid)
+- [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy focused LLM chat interface with optional encryption)

 ### Extensions & Plugins

@ -402,6 +421,8 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Discord-Ollama Chat Bot](https://github.com/kevinthedang/discord-ollama) (Generalized TypeScript Discord Bot w/ Tuning Documentation)
 - [Discord AI chat/moderation bot](https://github.com/rapmd73/Companion) Chat/moderation bot written in python. Uses Ollama to create personalities.
 - [Headless Ollama](https://github.com/nischalj10/headless-ollama) (Scripts to automatically install ollama client & models on any OS for apps that depends on ollama server)
+- [vnc-lm](https://github.com/jk011ru/vnc-lm) (A containerized Discord bot with support for attachments and web links)
+- [LSP-AI](https://github.com/SilasMarvin/lsp-ai) (Open-source language server for AI-powered functionality)

 ### Supported backends

--- a/api/types.go
+++ b/api/types.go
@ -296,15 +296,17 @@ type EmbeddingResponse struct {
 // CreateRequest is the request passed to [Client.Create].
 type CreateRequest struct {
 	Model     string `json:"model"`
-	Path      string `json:"path"`
 	Modelfile string `json:"modelfile"`
 	Stream    *bool  `json:"stream,omitempty"`
 	Quantize  string `json:"quantize,omitempty"`

-	// Name is deprecated, see Model
+	// Deprecated: set the model name with Model instead
 	Name string `json:"name"`

-	// Quantization is deprecated, see Quantize
+	// Deprecated: set the file content with Modelfile instead
+	Path string `json:"path"`
+
+	// Deprecated: use Quantize instead
 	Quantization string `json:"quantization,omitempty"`
 }

@ -312,7 +314,7 @@ type CreateRequest struct {
 type DeleteRequest struct {
 	Model string `json:"model"`

-	// Name is deprecated, see Model
+	// Deprecated: set the model name with Model instead
 	Name string `json:"name"`
 }

@ -327,7 +329,7 @@ type ShowRequest struct {

 	Options map[string]interface{} `json:"options"`

-	// Name is deprecated, see Model
+	// Deprecated: set the model name with Model instead
 	Name string `json:"name"`
 }

@ -359,7 +361,7 @@ type PullRequest struct {
 	Password string `json:"password"`
 	Stream   *bool  `json:"stream,omitempty"`

-	// Name is deprecated, see Model
+	// Deprecated: set the model name with Model instead
 	Name string `json:"name"`
 }

@ -380,7 +382,7 @@ type PushRequest struct {
 	Password string `json:"password"`
 	Stream   *bool  `json:"stream,omitempty"`

-	// Name is deprecated, see Model
+	// Deprecated: set the model name with Model instead
 	Name string `json:"name"`
 }

--- a/app/ollama.iss
+++ b/app/ollama.iss
@ -87,7 +87,7 @@ DialogFontSize=12

 [Files]
 Source: ".\app.exe"; DestDir: "{app}"; DestName: "{#MyAppExeName}" ; Flags: ignoreversion 64bit
-Source: "..\ollama.exe"; DestDir: "{app}\bin"; Flags: ignoreversion 64bit
+Source: "..\ollama.exe"; DestDir: "{app}"; Flags: ignoreversion 64bit
 Source: "..\dist\windows-{#ARCH}\lib\ollama\runners\*"; DestDir: "{app}\lib\ollama\runners"; Flags: ignoreversion 64bit recursesubdirs
 Source: "..\dist\ollama_welcome.ps1"; DestDir: "{app}"; Flags: ignoreversion
 Source: ".\assets\app.ico"; DestDir: "{app}"; Flags: ignoreversion
@ -99,7 +99,7 @@ Name: "{userstartup}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilen
 Name: "{userprograms}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilename: "{app}\app.ico"

 [Run]
-Filename: "{cmd}"; Parameters: "/C set PATH={app}\bin;%PATH% & ""{app}\{#MyAppExeName}"""; Flags: postinstall nowait runhidden
+Filename: "{cmd}"; Parameters: "/C set PATH={app};%PATH% & ""{app}\{#MyAppExeName}"""; Flags: postinstall nowait runhidden

 [UninstallRun]
 ; Filename: "{cmd}"; Parameters: "/C ""taskkill /im ''{#MyAppExeName}'' /f /t"; Flags: runhidden
@ -134,8 +134,8 @@ SetupAppRunningError=Another Ollama installer is running.%n%nPlease cancel or fi

 [Registry]
 Root: HKCU; Subkey: "Environment"; \
-    ValueType: expandsz; ValueName: "Path"; ValueData: "{olddata};{app}\bin"; \
-    Check: NeedsAddPath('{app}\bin')
+    ValueType: expandsz; ValueName: "Path"; ValueData: "{olddata};{app}"; \
+    Check: NeedsAddPath('{app}')

 [Code]

--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@ -204,6 +204,12 @@ func tempZipFiles(path string) (string, error) {
 		// safetensors files might be unresolved git lfs references; skip if they are
 		// covers model-x-of-y.safetensors, model.fp32-x-of-y.safetensors, model.safetensors
 		files = append(files, st...)
+	} else if st, _ := glob(filepath.Join(path, "adapters.safetensors"), "application/octet-stream"); len(st) > 0 {
+		// covers adapters.safetensors
+		files = append(files, st...)
+	} else if st, _ := glob(filepath.Join(path, "adapter_model.safetensors"), "application/octet-stream"); len(st) > 0 {
+		// covers adapter_model.safetensors
+		files = append(files, st...)
 	} else if pt, _ := glob(filepath.Join(path, "pytorch_model*.bin"), "application/zip"); len(pt) > 0 {
 		// pytorch files might also be unresolved git lfs references; skip if they are
 		// covers pytorch_model-x-of-y.bin, pytorch_model.fp32-x-of-y.bin, pytorch_model.bin
@ -720,14 +726,17 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
 }

 func showInfo(resp *api.ShowResponse) {
-	arch := resp.ModelInfo["general.architecture"].(string)
-
 	modelData := [][]string{
-		{"arch", arch},
 		{"parameters", resp.Details.ParameterSize},
 		{"quantization", resp.Details.QuantizationLevel},
-		{"context length", fmt.Sprintf("%v", resp.ModelInfo[fmt.Sprintf("%s.context_length", arch)].(float64))},
-		{"embedding length", fmt.Sprintf("%v", resp.ModelInfo[fmt.Sprintf("%s.embedding_length", arch)].(float64))},
+	}
+	if resp.ModelInfo != nil {
+		arch := resp.ModelInfo["general.architecture"].(string)
+		modelData = append(modelData,
+			[]string{"arch", arch},
+			[]string{"context length", fmt.Sprintf("%v", resp.ModelInfo[fmt.Sprintf("%s.context_length", arch)].(float64))},
+			[]string{"embedding length", fmt.Sprintf("%v", resp.ModelInfo[fmt.Sprintf("%s.embedding_length", arch)].(float64))},
+		)
 	}

 	mainTableData := [][]string{
--- a/convert/convert.go
+++ b/convert/convert.go
@ -12,12 +12,22 @@ import (
 	"github.com/ollama/ollama/llm"
 )

-type Parameters struct {
+type ModelParameters struct {
 	Architectures []string `json:"architectures"`
 	VocabSize     uint32   `json:"vocab_size"`
 }

-func (Parameters) KV(t *Tokenizer) llm.KV {
+// AdapterParameters holds the fields decoded from an adapter's
+// adapter_config.json. Two alpha values are declared: the top-level
+// "lora_alpha" and the nested "lora_parameters.alpha"; KV prefers the nested
+// value when it is non-zero.
+type AdapterParameters struct {
+	Alpha          uint32 `json:"lora_alpha"`
+	LoraLayers     uint32 `json:"lora_layers"`
+	LoraParameters struct {
+		Rank  uint32  `json:"rank"`
+		Alpha float32 `json:"alpha"`
+		Scale float32 `json:"scale"`
+	} `json:"lora_parameters"`
+}
+
+func (ModelParameters) KV(t *Tokenizer) llm.KV {
 	kv := llm.KV{
 		"general.file_type":            uint32(1),
 		"general.quantization_version": uint32(2),
@ -44,17 +54,40 @@ func (Parameters) KV(t *Tokenizer) llm.KV {
 	return kv
 }

-func (Parameters) specialTokenTypes() []string {
+func (p AdapterParameters) KV() llm.KV {
+	var alpha float32
+	if p.LoraParameters.Alpha == 0 {
+		alpha = float32(p.Alpha)
+	} else {
+		alpha = p.LoraParameters.Alpha
+	}
+
+	kv := llm.KV{
+		"adapter.lora.alpha": alpha,
+		"adapter.type":       "lora",
+		"general.file_type":  uint32(1),
+		"general.type":       "adapter",
+		"general.version":    "v0.2",
+	}
+
+	return kv
+}
+
+func (ModelParameters) specialTokenTypes() []string {
 	return []string{
 		"bos", "eos", "unk", "sep", "pad", "cls", "mask",
 	}
 }

-func (Parameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error {
+func (ModelParameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error {
 	return llm.WriteGGUF(ws, kv, ts)
 }

-type Converter interface {
+// writeFile writes the key-values and tensors to ws by delegating to
+// llm.WriteGGUF.
+func (AdapterParameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error {
+	return llm.WriteGGUF(ws, kv, ts)
+}
+
+type ModelConverter interface {
 	// KV maps parameters to LLM key-values
 	KV(*Tokenizer) llm.KV
 	// Tensors maps input tensors to LLM tensors. Model specific modifications can be done here.
@ -73,17 +106,67 @@ type moreParser interface {
 	parseMore(fs.FS) error
 }

+// AdapterConverter is implemented by the architecture-specific adapter
+// converters that ConvertAdapter selects from the base model's architecture.
+type AdapterConverter interface {
+	// KV maps adapter parameters to LLM key-values. The argument is the base
+	// model's key-values, as passed to ConvertAdapter.
+	KV(llm.KV) llm.KV
+	// Tensors maps input tensors to LLM tensors. Adapter specific modifications can be done here.
+	Tensors([]Tensor) []llm.Tensor
+	// Replacements returns a list of string pairs to replace in tensor names.
+	// See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details
+	Replacements() []string
+
+	// writeFile serializes the key-values and tensors to the writer.
+	writeFile(io.WriteSeeker, llm.KV, []llm.Tensor) error
+}
+
+// ConvertAdapter writes an Ollama compatible adapter to ws, based on
+// adapter_config.json and the tensors it finds in fsys.
+//
+// baseKV holds the key-values of the base model the adapter applies to. Its
+// "general.architecture" entry selects the converter; "llama" and "gemma2"
+// are supported. An error is returned when the config cannot be read or
+// decoded, when the architecture is missing or unsupported, or when parsing
+// the tensors or writing the output fails.
+func ConvertAdapter(fsys fs.FS, ws io.WriteSeeker, baseKV llm.KV) error {
+	bts, err := fs.ReadFile(fsys, "adapter_config.json")
+	if err != nil {
+		return err
+	}
+
+	arch, ok := baseKV["general.architecture"]
+	if !ok {
+		return errors.New("architecture not set for the base model")
+	}
+
+	var conv AdapterConverter
+	switch arch {
+	case "llama":
+		conv = &llamaAdapter{}
+	case "gemma2":
+		conv = &gemma2Adapter{}
+	default:
+		return errors.New("unsupported architecture")
+	}
+
+	// Decode the config once, straight into the selected converter, and do so
+	// before the tensors are parsed so a malformed config fails early.
+	if err := json.Unmarshal(bts, conv); err != nil {
+		return err
+	}
+
+	ts, err := parseTensors(fsys, strings.NewReplacer(conv.Replacements()...))
+	if err != nil {
+		return err
+	}
+
+	// Arguments are evaluated left to right, so KV runs before Tensors;
+	// llamaAdapter.KV records the head count that its repackers read.
+	return conv.writeFile(ws, conv.KV(baseKV), conv.Tensors(ts))
+}
+
 // Convert writes an Ollama compatible model to the provided io.WriteSeeker based on configurations
 // and files it finds in the input path.
 // Supported input model formats include safetensors.
 // Supported input tokenizers files include tokenizer.json (preferred) and tokenizer.model.
-func Convert(fsys fs.FS, ws io.WriteSeeker) error {
+func ConvertModel(fsys fs.FS, ws io.WriteSeeker) error {
 	bts, err := fs.ReadFile(fsys, "config.json")
 	if err != nil {
 		return err
 	}

-	var p Parameters
+	var p ModelParameters
 	if err := json.Unmarshal(bts, &p); err != nil {
 		return err
 	}
@ -92,20 +175,20 @@ func Convert(fsys fs.FS, ws io.WriteSeeker) error {
 		return errors.New("unknown architecture")
 	}

-	var conv Converter
+	var conv ModelConverter
 	switch p.Architectures[0] {
 	case "LlamaForCausalLM", "MistralForCausalLM":
-		conv = &llama{}
+		conv = &llamaModel{}
 	case "MixtralForCausalLM":
-		conv = &mixtral{}
+		conv = &mixtralModel{}
 	case "GemmaForCausalLM":
-		conv = &gemma{}
+		conv = &gemmaModel{}
 	case "Gemma2ForCausalLM":
-		conv = &gemma2{}
+		conv = &gemma2Model{}
 	case "Phi3ForCausalLM":
-		conv = &phi3{}
+		conv = &phi3Model{}
 	case "BertModel":
-		conv = &bert{}
+		conv = &bertModel{}
 	default:
 		return errors.New("unsupported architecture")
 	}
--- a/convert/convert_bert.go
+++ b/convert/convert_bert.go
@ -11,8 +11,8 @@ import (
 	"github.com/ollama/ollama/llm"
 )

-type bert struct {
-	Parameters
+type bertModel struct {
+	ModelParameters
 	NLayers               uint32  `json:"n_layers"`
 	NumHiddenLayers       uint32  `json:"num_hidden_layers"`
 	NLayer                uint32  `json:"n_layer"`
@ -33,11 +33,11 @@ type bert struct {
 }

 var (
-	_ Converter  = (*bert)(nil)
-	_ moreParser = (*bert)(nil)
+	_ ModelConverter = (*bertModel)(nil)
+	_ moreParser     = (*bertModel)(nil)
 )

-func (p *bert) parseMore(fsys fs.FS) error {
+func (p *bertModel) parseMore(fsys fs.FS) error {
 	bts, err := fs.ReadFile(fsys, "modules.json")
 	if err != nil {
 		return err
@ -85,8 +85,8 @@ func (p *bert) parseMore(fsys fs.FS) error {
 	return nil
 }

-func (p *bert) KV(t *Tokenizer) llm.KV {
-	kv := p.Parameters.KV(t)
+func (p *bertModel) KV(t *Tokenizer) llm.KV {
+	kv := p.ModelParameters.KV(t)
 	kv["general.architecture"] = "bert"
 	kv["bert.attention.causal"] = false
 	kv["bert.pooling_type"] = p.PoolingType
@ -132,7 +132,7 @@ func (p *bert) KV(t *Tokenizer) llm.KV {
 	return kv
 }

-func (p *bert) Tensors(ts []Tensor) []llm.Tensor {
+func (p *bertModel) Tensors(ts []Tensor) []llm.Tensor {
 	var out []llm.Tensor
 	for _, t := range ts {
 		if slices.Contains([]string{
@ -154,7 +154,7 @@ func (p *bert) Tensors(ts []Tensor) []llm.Tensor {
 	return out
 }

-func (bert) Replacements() []string {
+func (bertModel) Replacements() []string {
 	return []string{
 		"encoder.layer", "blk",
 		"encoder.layers", "blk",
--- a/convert/convert_gemma.go
+++ b/convert/convert_gemma.go
@ -9,8 +9,8 @@ import (
 	"github.com/ollama/ollama/llm"
 )

-type gemma struct {
-	Parameters
+type gemmaModel struct {
+	ModelParameters
 	MaxPositionEmbeddings uint32  `json:"max_position_embeddings"`
 	HiddenSize            uint32  `json:"hidden_size"`
 	HiddenLayers          uint32  `json:"num_hidden_layers"`
@ -21,10 +21,10 @@ type gemma struct {
 	HeadDim               uint32  `json:"head_dim"`
 }

-var _ Converter = (*gemma)(nil)
+var _ ModelConverter = (*gemmaModel)(nil)

-func (p *gemma) KV(t *Tokenizer) llm.KV {
-	kv := p.Parameters.KV(t)
+func (p *gemmaModel) KV(t *Tokenizer) llm.KV {
+	kv := p.ModelParameters.KV(t)
 	kv["general.architecture"] = "gemma"
 	kv["gemma.context_length"] = p.MaxPositionEmbeddings
 	kv["gemma.embedding_length"] = p.HiddenSize
@ -42,8 +42,8 @@ func (p *gemma) KV(t *Tokenizer) llm.KV {
 	return kv
 }

-func (p *gemma) Tensors(ts []Tensor) []llm.Tensor {
-	out := make([]llm.Tensor, 0, len(ts))
+func (p *gemmaModel) Tensors(ts []Tensor) []llm.Tensor {
+	var out []llm.Tensor
 	for _, t := range ts {
 		if strings.HasSuffix(t.Name(), "_norm.weight") {
 			t.SetRepacker(p.addOne)
@ -60,7 +60,7 @@ func (p *gemma) Tensors(ts []Tensor) []llm.Tensor {
 	return out
 }

-func (p *gemma) Replacements() []string {
+func (p *gemmaModel) Replacements() []string {
 	return []string{
 		"model.embed_tokens", "token_embd",
 		"model.norm", "output_norm",
@ -77,7 +77,7 @@ func (p *gemma) Replacements() []string {
 	}
 }

-func (*gemma) addOne(_ string, data []float32, shape []uint64) ([]float32, error) {
+func (*gemmaModel) addOne(_ string, data []float32, shape []uint64) ([]float32, error) {
 	n := tensor.New(tensor.WithShape(int(shape[0])), tensor.WithBacking(data))
 	ones := tensor.Ones(tensor.Float32, int(shape[0]))

--- a/convert/convert_gemma2.go
+++ b/convert/convert_gemma2.go
@ -4,15 +4,15 @@ import (
 	"github.com/ollama/ollama/llm"
 )

-type gemma2 struct {
-	gemma
+type gemma2Model struct {
+	gemmaModel
 	SlidingWindow         uint32  `json:"sliding_window"`
 	AttentionLogitSoftcap float32 `json:"attn_logit_softcapping"`
 	FinalLogitSoftcap     float32 `json:"final_logit_softcapping"`
 }

-func (p *gemma2) KV(t *Tokenizer) llm.KV {
-	kv := p.Parameters.KV(t)
+func (p *gemma2Model) KV(t *Tokenizer) llm.KV {
+	kv := p.ModelParameters.KV(t)
 	kv["general.architecture"] = "gemma2"
 	kv["gemma2.context_length"] = p.MaxPositionEmbeddings
 	kv["gemma2.embedding_length"] = p.HiddenSize
@ -33,9 +33,9 @@ func (p *gemma2) KV(t *Tokenizer) llm.KV {
 	return kv
 }

-func (p *gemma2) Replacements() []string {
+func (p *gemma2Model) Replacements() []string {
 	return append(
-		p.gemma.Replacements(),
+		p.gemmaModel.Replacements(),
 		"post_attention_layernorm", "post_attention_norm",
 		"pre_feedforward_layernorm", "ffn_norm",
 		"post_feedforward_layernorm", "post_ffw_norm",
--- a/convert/convert_gemma2_adapter.go
+++ b/convert/convert_gemma2_adapter.go
@ -0,0 +1,91 @@
+package convert
+
+import (
+	"strings"
+
+	"github.com/pdevine/tensor"
+	"github.com/pdevine/tensor/native"
+
+	"github.com/ollama/ollama/llm"
+)
+
+// gemma2Adapter is the AdapterConverter used when the base model's
+// architecture is "gemma2". It adds no fields beyond AdapterParameters.
+type gemma2Adapter struct {
+	AdapterParameters
+}
+
+// Compile-time check that *gemma2Adapter implements AdapterConverter.
+var _ AdapterConverter = (*gemma2Adapter)(nil)
+
+// KV returns the common adapter key-values with "general.architecture" set
+// to "gemma2". baseKV is accepted to satisfy AdapterConverter and is not read.
+func (p *gemma2Adapter) KV(baseKV llm.KV) llm.KV {
+	kv := p.AdapterParameters.KV()
+	kv["general.architecture"] = "gemma2"
+	return kv
+}
+
+// Tensors maps the parsed adapter tensors to llm.Tensor values.
+//
+// A "weight.lora_a" tensor whose first dimension is larger than its second,
+// or a "weight.lora_b" tensor whose first dimension is smaller than its
+// second, has its first two shape entries swapped and is given p.repack as
+// its repacker. All other tensors are passed through unchanged.
+func (p *gemma2Adapter) Tensors(ts []Tensor) []llm.Tensor {
+	var out []llm.Tensor
+	for _, t := range ts {
+		shape := t.Shape()
+		// The length check keeps shape[1] from indexing out of range on
+		// tensors with fewer than two dimensions.
+		if len(shape) >= 2 &&
+			((strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) ||
+				(strings.HasSuffix(t.Name(), "weight.lora_b") && shape[0] < shape[1])) {
+			shape[0], shape[1] = shape[1], shape[0]
+			t.SetRepacker(p.repack)
+		}
+
+		out = append(out, llm.Tensor{
+			Name: t.Name(),
+			Kind: t.Kind(),
+			// Emit the slice that was just (possibly) swapped, as
+			// llamaAdapter.Tensors does, rather than asking the tensor again.
+			Shape:    shape,
+			WriterTo: t,
+		})
+	}
+
+	return out
+}
+
+// Replacements returns old/new string pairs used to rename tensors;
+// ConvertAdapter passes them to strings.NewReplacer. Both the
+// "lora_A.weight"/"lora_B.weight" and the "lora_a"/"lora_b" spellings are
+// mapped to the same "weight.lora_a"/"weight.lora_b" names.
+func (p *gemma2Adapter) Replacements() []string {
+	return []string{
+		"base_model.model.", "",
+		"model.layers", "blk",
+		"self_attn.q_proj", "attn_q",
+		"self_attn.k_proj", "attn_k",
+		"self_attn.v_proj", "attn_v",
+		"self_attn.o_proj", "attn_output",
+		"mlp.gate_proj", "ffn_gate",
+		"mlp.down_proj", "ffn_down",
+		"mlp.up_proj", "ffn_up",
+		"lora_A.weight", "weight.lora_a",
+		"lora_B.weight", "weight.lora_b",
+		"lora_a", "weight.lora_a",
+		"lora_b", "weight.lora_b",
+	}
+}
+
+// repack is the repacker that Tensors registers on tensors whose shape it
+// swapped. It wraps data in a 2-D tensor, applies the transpose/reshape
+// sequence below, and returns the values as one flat slice. name is unused.
+// NOTE(review): presumably shape is the already-swapped shape, so dims
+// describes the original layout of data — confirm against the repacker's caller.
+func (p *gemma2Adapter) repack(name string, data []float32, shape []uint64) ([]float32, error) {
+	// dims is shape with its two entries in reverse order.
+	dims := []int{int(shape[1]), int(shape[0])}
+
+	n := tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
+
+	if err := n.T(1, 0); err != nil {
+		return nil, err
+	}
+
+	if err := n.Reshape(dims...); err != nil {
+		return nil, err
+	}
+
+	if err := n.Transpose(); err != nil {
+		return nil, err
+	}
+
+	ts, err := native.SelectF32(n, 1)
+	if err != nil {
+		return nil, err
+	}
+
+	// Concatenate the slices returned by SelectF32 into one flat result.
+	var f32s []float32
+	for _, t := range ts {
+		f32s = append(f32s, t...)
+	}
+
+	return f32s, nil
+}
--- a/convert/convert_llama.go
+++ b/convert/convert_llama.go
@ -12,8 +12,8 @@ import (
 	"github.com/ollama/ollama/llm"
 )

-type llama struct {
-	Parameters
+type llamaModel struct {
+	ModelParameters
 	NLayers               uint32  `json:"n_layers"`
 	NumHiddenLayers       uint32  `json:"num_hidden_layers"`
 	NLayer                uint32  `json:"n_layer"`
@ -44,10 +44,10 @@ type llama struct {
 	HeadDim          uint32  `json:"head_dim"`
 }

-var _ Converter = (*llama)(nil)
+var _ ModelConverter = (*llamaModel)(nil)

-func (p *llama) KV(t *Tokenizer) llm.KV {
-	kv := p.Parameters.KV(t)
+func (p *llamaModel) KV(t *Tokenizer) llm.KV {
+	kv := p.ModelParameters.KV(t)
 	kv["general.architecture"] = "llama"
 	kv["llama.vocab_size"] = p.VocabSize

@ -120,7 +120,7 @@ func (p *llama) KV(t *Tokenizer) llm.KV {
 	return kv
 }

-func (p *llama) Tensors(ts []Tensor) []llm.Tensor {
+func (p *llamaModel) Tensors(ts []Tensor) []llm.Tensor {
 	var out []llm.Tensor

 	if p.RopeScaling.factors != nil {
@ -149,7 +149,7 @@ func (p *llama) Tensors(ts []Tensor) []llm.Tensor {
 	return out
 }

-func (p *llama) Replacements() []string {
+func (p *llamaModel) Replacements() []string {
 	return []string{
 		"lm_head", "output",
 		"model.embed_tokens", "token_embd",
@ -167,7 +167,7 @@ func (p *llama) Replacements() []string {
 	}
 }

-func (p *llama) repack(name string, data []float32, shape []uint64) ([]float32, error) {
+func (p *llamaModel) repack(name string, data []float32, shape []uint64) ([]float32, error) {
 	var dims []int
 	for _, dim := range shape {
 		dims = append(dims, int(dim))
--- a/convert/convert_llama_adapter.go
+++ b/convert/convert_llama_adapter.go
@ -0,0 +1,169 @@
+package convert
+
+import (
+	"cmp"
+	"strings"
+
+	"github.com/pdevine/tensor"
+	"github.com/pdevine/tensor/native"
+
+	"github.com/ollama/ollama/llm"
+)
+
+// llamaAdapter is the AdapterConverter used when the base model's
+// architecture is "llama". The head counts are read by repack and
+// repackAndTranspose; NumAttentionHeads is overwritten by KV from the base
+// model's key-values.
+type llamaAdapter struct {
+	AdapterParameters
+	NumAttentionHeads uint32 `json:"num_attention_heads"`
+	NumKeyValueHeads  uint32 `json:"num_key_value_heads"`
+}
+
+// Compile-time check that *llamaAdapter implements AdapterConverter.
+var _ AdapterConverter = (*llamaAdapter)(nil)
+
+// KV returns the common adapter key-values with "general.architecture" set
+// to "llama" and the two attention head counts copied from baseKV.
+//
+// As a side effect it stores the base model's head count in
+// p.NumAttentionHeads, which repack and repackAndTranspose read.
+// It panics if baseKV["llama.attention.head_count"] is absent or is not a
+// uint32, because the type assertion below is unchecked.
+func (p *llamaAdapter) KV(baseKV llm.KV) llm.KV {
+	kv := p.AdapterParameters.KV()
+	kv["general.architecture"] = "llama"
+	kv["llama.attention.head_count"] = baseKV["llama.attention.head_count"]
+	kv["llama.attention.head_count_kv"] = baseKV["llama.attention.head_count_kv"]
+
+	// NOTE(review): NumKeyValueHeads is not filled from baseKV here, so the
+	// repackers fall back to NumAttentionHeads unless the JSON config set it.
+	p.NumAttentionHeads = baseKV["llama.attention.head_count"].(uint32)
+
+	return kv
+}
+
+// Tensors maps the parsed adapter tensors to llm.Tensor values and attaches a
+// repacker to each one.
+//
+// A "weight.lora_a" tensor whose first dimension is larger than its second,
+// or a "weight.lora_b" tensor whose first dimension is smaller than its
+// second, has its first two shape entries swapped and gets
+// p.repackAndTranspose; every other tensor gets p.repack.
+func (p *llamaAdapter) Tensors(ts []Tensor) []llm.Tensor {
+	var out []llm.Tensor
+	for _, t := range ts {
+		shape := t.Shape()
+
+		// && short-circuits, so shape is only indexed when the suffix matches.
+		swapA := strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]
+		swapB := strings.HasSuffix(t.Name(), "weight.lora_b") && shape[0] < shape[1]
+
+		switch {
+		case swapA || swapB:
+			shape[0], shape[1] = shape[1], shape[0]
+			t.SetRepacker(p.repackAndTranspose)
+		default:
+			t.SetRepacker(p.repack)
+		}
+
+		converted := llm.Tensor{
+			Name:     t.Name(),
+			Kind:     t.Kind(),
+			Shape:    shape,
+			WriterTo: t,
+		}
+		out = append(out, converted)
+	}
+
+	return out
+}
+
+// Replacements returns old/new string pairs used to rename tensors;
+// ConvertAdapter passes them to strings.NewReplacer. Both the
+// "lora_A.weight"/"lora_B.weight" and the "lora_a"/"lora_b" spellings are
+// mapped to the same "weight.lora_a"/"weight.lora_b" names.
+func (p *llamaAdapter) Replacements() []string {
+	return []string{
+		"base_model.model.", "",
+		"model.layers", "blk",
+		"self_attn.q_proj", "attn_q",
+		"self_attn.k_proj", "attn_k",
+		"self_attn.v_proj", "attn_v",
+		"self_attn.o_proj", "attn_output",
+		"mlp.gate_proj", "ffn_gate",
+		"mlp.down_proj", "ffn_down",
+		"mlp.up_proj", "ffn_up",
+		"lora_A.weight", "weight.lora_a",
+		"lora_B.weight", "weight.lora_b",
+		"lora_a", "weight.lora_a",
+		"lora_b", "weight.lora_b",
+	}
+}
+
+// repack is the repacker that Tensors registers on tensors whose shape it did
+// not swap. Only names ending in "attn_q.weight.lora_a" or
+// "attn_k.weight.lora_a" are changed; for any other name data is returned
+// as-is. For those two, data is regrouped per attention head by the
+// reshape/transpose sequence below and returned as one flat slice.
+// NOTE(review): heads is used as a divisor, so a zero head count would panic
+// with a division by zero — confirm KV always runs first with a non-zero value.
+func (p *llamaAdapter) repack(name string, data []float32, shape []uint64) ([]float32, error) {
+	// dims is shape with its two entries in reverse order.
+	dims := []int{int(shape[1]), int(shape[0])}
+
+	var heads uint32
+	if strings.HasSuffix(name, "attn_q.weight.lora_a") {
+		heads = p.NumAttentionHeads
+	} else if strings.HasSuffix(name, "attn_k.weight.lora_a") {
+		// use the key/value head count when set, else the attention head count
+		heads = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads)
+	} else {
+		return data, nil
+	}
+
+	n := tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
+
+	// Split the first dimension into (heads, 2, dims[0]/heads/2), swap the
+	// middle two axes, then restore the 2-D shape.
+	if err := n.Reshape(append([]int{int(heads), 2, dims[0] / int(heads) / 2}, dims[1:]...)...); err != nil {
+		return nil, err
+	}
+
+	if err := n.T(0, 2, 1, 3); err != nil {
+		return nil, err
+	}
+
+	if err := n.Reshape(dims...); err != nil {
+		return nil, err
+	}
+
+	if err := n.Transpose(); err != nil {
+		return nil, err
+	}
+
+	ts, err := native.SelectF32(n, 1)
+	if err != nil {
+		return nil, err
+	}
+
+	// Concatenate the slices returned by SelectF32 into one flat result.
+	var f32s []float32
+	for _, t := range ts {
+		f32s = append(f32s, t...)
+	}
+
+	return f32s, nil
+}
+
+// repackAndTranspose is the repacker that Tensors registers on tensors whose
+// shape it swapped. For names ending in "attn_q.weight.lora_a" or
+// "attn_k.weight.lora_a" with a non-zero head count it first applies the same
+// per-head regrouping as repack; every tensor then goes through the final
+// transpose/reshape sequence and is returned as one flat slice.
+// NOTE(review): presumably shape is the already-swapped shape, so dims
+// describes the original layout of data — confirm against the repacker's caller.
+func (p *llamaAdapter) repackAndTranspose(name string, data []float32, shape []uint64) ([]float32, error) {
+	// dims is shape with its two entries in reverse order.
+	dims := []int{int(shape[1]), int(shape[0])}
+
+	n := tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
+
+	// heads stays 0 for every other tensor name, which skips the regrouping.
+	var heads uint32
+	if strings.HasSuffix(name, "attn_q.weight.lora_a") {
+		heads = p.NumAttentionHeads
+	} else if strings.HasSuffix(name, "attn_k.weight.lora_a") {
+		heads = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads)
+	}
+
+	if heads > 0 {
+		// Split the first dimension into (heads, 2, dims[0]/heads/2), swap
+		// the middle two axes, then restore the 2-D shape.
+		if err := n.Reshape(append([]int{int(heads), 2, dims[0] / int(heads) / 2}, dims[1:]...)...); err != nil {
+			return nil, err
+		}
+
+		if err := n.T(0, 2, 1, 3); err != nil {
+			return nil, err
+		}
+
+		if err := n.Reshape(dims...); err != nil {
+			return nil, err
+		}
+
+		if err := n.Transpose(); err != nil {
+			return nil, err
+		}
+	}
+
+	// Final transpose step applied to every tensor handled here.
+	if err := n.T(1, 0); err != nil {
+		return nil, err
+	}
+
+	if err := n.Reshape(dims...); err != nil {
+		return nil, err
+	}
+
+	if err := n.Transpose(); err != nil {
+		return nil, err
+	}
+
+	ts, err := native.SelectF32(n, 1)
+	if err != nil {
+		return nil, err
+	}
+
+	// Concatenate the slices returned by SelectF32 into one flat result.
+	var f32s []float32
+	for _, t := range ts {
+		f32s = append(f32s, t...)
+	}
+
+	return f32s, nil
+}
--- a/convert/convert_mixtral.go
+++ b/convert/convert_mixtral.go
@ -9,14 +9,14 @@ import (
 	"github.com/ollama/ollama/llm"
 )

-type mixtral struct {
-	llama
+type mixtralModel struct {
+	llamaModel
 	NumLocalExperts    uint32 `json:"num_local_experts"`
 	NumExpertsPerToken uint32 `json:"num_experts_per_tok"`
 }

-func (p *mixtral) KV(t *Tokenizer) llm.KV {
-	kv := p.llama.KV(t)
+func (p *mixtralModel) KV(t *Tokenizer) llm.KV {
+	kv := p.llamaModel.KV(t)

 	if p.NumLocalExperts > 0 {
 		kv["llama.expert_count"] = p.NumLocalExperts
@ -29,7 +29,7 @@ func (p *mixtral) KV(t *Tokenizer) llm.KV {
 	return kv
 }

-func (p *mixtral) Tensors(ts []Tensor) []llm.Tensor {
+func (p *mixtralModel) Tensors(ts []Tensor) []llm.Tensor {
 	oldnew := []string{
 		"model.layers", "blk",
 		"w1", "ffn_gate_exps",
@ -67,12 +67,12 @@ func (p *mixtral) Tensors(ts []Tensor) []llm.Tensor {
 		})
 	}

-	return append(out, p.llama.Tensors(ts)...)
+	return append(out, p.llamaModel.Tensors(ts)...)
 }

-func (p *mixtral) Replacements() []string {
+func (p *mixtralModel) Replacements() []string {
 	return append(
-		p.llama.Replacements(),
+		p.llamaModel.Replacements(),
 		"block_sparse_moe.gate", "ffn_gate_inp",
 	)
 }
--- a/convert/convert_phi3.go
+++ b/convert/convert_phi3.go
@ -11,8 +11,8 @@ import (
 	"github.com/ollama/ollama/llm"
 )

-type phi3 struct {
-	Parameters
+type phi3Model struct {
+	ModelParameters
 	NumHiddenLayers   uint32  `json:"num_hidden_layers"`
 	NLayers           uint32  `json:"n_layers"`
 	HiddenSize        uint32  `json:"hidden_size"`
@ -35,10 +35,10 @@ type phi3 struct {
 	SlidingWindow                 uint32  `json:"sliding_window"`
 }

-var _ Converter = (*phi3)(nil)
+var _ ModelConverter = (*phi3Model)(nil)

-func (p *phi3) KV(t *Tokenizer) llm.KV {
-	kv := p.Parameters.KV(t)
+func (p *phi3Model) KV(t *Tokenizer) llm.KV {
+	kv := p.ModelParameters.KV(t)
 	kv["general.architecture"] = "phi3"
 	kv["phi3.context_length"] = p.MaxPositionEmbeddings
 	kv["phi3.embedding_length"] = cmp.Or(p.HiddenSize, p.NEmbd)
@ -68,7 +68,7 @@ func (p *phi3) KV(t *Tokenizer) llm.KV {
 	return kv
 }

-func (p *phi3) Tensors(ts []Tensor) []llm.Tensor {
+func (p *phi3Model) Tensors(ts []Tensor) []llm.Tensor {
 	var addRopeFactors sync.Once

 	out := make([]llm.Tensor, 0, len(ts)+2)
@ -100,7 +100,7 @@ func (p *phi3) Tensors(ts []Tensor) []llm.Tensor {
 	return out
 }

-func (p *phi3) Replacements() []string {
+func (p *phi3Model) Replacements() []string {
 	return []string{
 		"lm_head", "output",
 		"model.embed_tokens", "token_embd",
--- a/convert/convert_test.go
+++ b/convert/convert_test.go
@ -1,7 +1,9 @@
 package convert

 import (
+	"bytes"
 	"crypto/sha256"
+	"encoding/binary"
 	"encoding/hex"
 	"encoding/json"
 	"flag"
@ -29,7 +31,7 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, llm.Tensors) {
 	}
 	defer f.Close()

-	if err := Convert(fsys, f); err != nil {
+	if err := ConvertModel(fsys, f); err != nil {
 		t.Fatal(err)
 	}

@ -51,6 +53,34 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, llm.Tensors) {
 	return r, m.KV(), m.Tensors()
 }

+// generateResultsJSON builds the name -> fingerprint map that the conversion
+// tests compare against their expectations.
+//
+// For every KV entry the value is rendered with %v, unless it implements
+// json.Marshaler, in which case the hex SHA-256 of its JSON encoding is used.
+// For every tensor the value is the hex SHA-256 of the tensor's bytes, read
+// from f at tensors.Offset+tensor.Offset for tensor.Size() bytes.
+//
+// Any marshal or read error fails the calling test immediately.
+func generateResultsJSON(t *testing.T, f *os.File, kv llm.KV, tensors llm.Tensors) map[string]string {
+	// Report failures at the caller's line rather than inside this helper.
+	t.Helper()
+
+	actual := make(map[string]string)
+	for k, v := range kv {
+		s, ok := v.(json.Marshaler)
+		if !ok {
+			actual[k] = fmt.Sprintf("%v", v)
+			continue
+		}
+
+		bts, err := json.Marshal(s)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		actual[k] = fmt.Sprintf("%x", sha256.Sum256(bts))
+	}
+
+	for _, tensor := range tensors.Items {
+		sha256sum := sha256.New()
+		// SectionReader reads via ReadAt, so the file's seek position is not used.
+		sr := io.NewSectionReader(f, int64(tensors.Offset+tensor.Offset), int64(tensor.Size()))
+		if _, err := io.Copy(sha256sum, sr); err != nil {
+			t.Fatal(err)
+		}
+
+		actual[tensor.Name] = hex.EncodeToString(sha256sum.Sum(nil))
+	}
+
+	return actual
+}
+
 func TestMain(m *testing.M) {
 	var level slog.Level
 	flag.TextVar(&level, "level", slog.LevelInfo, "log level")
@ -59,7 +89,7 @@ func TestMain(m *testing.M) {
 	os.Exit(m.Run())
 }

-func TestConvertFull(t *testing.T) {
+func TestConvertModel(t *testing.T) {
 	cases := []string{
 		"Meta-Llama-3-8B-Instruct",
 		"Meta-Llama-3.1-8B-Instruct",
@ -85,29 +115,7 @@ func TestConvertFull(t *testing.T) {
 			}

 			f, kv, tensors := convertFull(t, os.DirFS(p))
-			actual := make(map[string]string)
-			for k, v := range kv {
-				if s, ok := v.(json.Marshaler); !ok {
-					actual[k] = fmt.Sprintf("%v", v)
-				} else {
-					bts, err := json.Marshal(s)
-					if err != nil {
-						t.Fatal(err)
-					}
-
-					actual[k] = fmt.Sprintf("%x", sha256.Sum256(bts))
-				}
-			}
-
-			for _, tensor := range tensors.Items {
-				sha256sum := sha256.New()
-				sr := io.NewSectionReader(f, int64(tensors.Offset+tensor.Offset), int64(tensor.Size()))
-				if _, err := io.Copy(sha256sum, sr); err != nil {
-					t.Fatal(err)
-				}
-
-				actual[tensor.Name] = hex.EncodeToString(sha256sum.Sum(nil))
-			}
+			actual := generateResultsJSON(t, f, kv, tensors)

 			expectFile, err := os.Open(filepath.Join("testdata", fmt.Sprintf("%s.json", tt)))
 			if err != nil {
@ -131,3 +139,310 @@ func TestConvertFull(t *testing.T) {
 		})
 	}
 }
+
+func TestConvertInvalidDatatype(t *testing.T) {
+	f, err := os.CreateTemp(t.TempDir(), "testmodel")
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer f.Close()
+
+	tempDir := t.TempDir()
+	generateSafetensorTestData(t, tempDir)
+
+	err = ConvertModel(os.DirFS(tempDir), f)
+	if err == nil || err.Error() != "unsupported safetensors model" {
+		t.Errorf("expected error but didn't get one")
+	}
+}
+
+// generateSafetensorTestData writes a minimal model directory into tempDir:
+//
+//   - model-00001-of-00001.safetensors: an 8-byte little-endian header length
+//     followed by the JSON header only (no tensor payload is written). The
+//     header declares an I8 weight tensor and a U8 "weight_format" tensor with
+//     an empty shape.
+//   - config.json: declares the LlamaForCausalLM architecture.
+//   - tokenizer.json: an empty JSON object.
+//
+// Any failure aborts the calling test.
+func generateSafetensorTestData(t *testing.T, tempDir string) {
+	// Report failures at the caller's line rather than inside this helper.
+	t.Helper()
+
+	type tensorData struct {
+		Offsets []int  `json:"data_offsets"`
+		Type    string `json:"dtype"`
+		Shape   []int  `json:"shape"`
+	}
+	offset := 4096 * 14336
+
+	td := map[string]*tensorData{
+		"model.layers.0.mlp.down_proj.weight": {
+			Offsets: []int{0, offset},
+			Type:    "I8",
+			Shape:   []int{4096, 14336},
+		},
+		"model.layers.0.mlp.down_proj.weight_format": {
+			Offsets: []int{offset, offset},
+			Type:    "U8",
+			Shape:   []int{},
+		},
+	}
+
+	data, err := json.Marshal(td)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Header length prefix followed by the JSON header itself.
+	var buf bytes.Buffer
+	if err := binary.Write(&buf, binary.LittleEndian, int64(len(data))); err != nil {
+		t.Fatal(err)
+	}
+
+	if _, err := buf.Write(data); err != nil {
+		t.Fatal(err)
+	}
+
+	configData := `
+{
+  "architectures": [
+    "LlamaForCausalLM"
+  ]
+}
+`
+
+	tokenizerData := `
+{
+}
+`
+
+	// os.WriteFile closes each file before returning and surfaces close
+	// errors, so nothing is left open or unchecked when the caller starts
+	// reading the directory.
+	for _, file := range []struct {
+		name string
+		data []byte
+	}{
+		{"model-00001-of-00001.safetensors", buf.Bytes()},
+		{"config.json", []byte(configData)},
+		{"tokenizer.json", []byte(tokenizerData)},
+	} {
+		if err := os.WriteFile(filepath.Join(tempDir, file.name), file.data, 0o644); err != nil {
+			t.Fatal(err)
+		}
+	}
+}
+
+// TestConvertAdapter converts the generated LoRA fixture with ConvertAdapter,
+// decodes the resulting file, and verifies that every expected KV value and
+// tensor checksum is present and matches. Keys that appear in the output but
+// not in Expected are not checked.
+func TestConvertAdapter(t *testing.T) {
+	type adapterCase struct {
+		Name     string
+		BaseKV   map[string]any
+		Expected map[string]string
+	}
+
+	cases := []adapterCase{
+		{
+			Name: "discollama",
+			BaseKV: map[string]any{
+				"general.architecture":          "llama",
+				"llama.attention.head_count":    uint32(32),
+				"llama.attention.head_count_kv": uint32(8),
+			},
+			Expected: map[string]string{
+				"general.architecture":          "llama",
+				"general.file_type":             "1",
+				"general.parameter_count":       "106496",
+				"general.type":                  "adapter",
+				"general.version":               "v0.2",
+				"adapter.lora.alpha":            "16",
+				"adapter.type":                  "lora",
+				"llama.attention.head_count":    "32",
+				"llama.attention.head_count_kv": "8",
+				"blk.31.attn_q.weight.lora_a":   "0eb3318b02cd313429bcc7621b539fdbb10240fea190c56c9e5f93fcd37a4e50",
+				"blk.31.attn_q.weight.lora_b":   "0eb3318b02cd313429bcc7621b539fdbb10240fea190c56c9e5f93fcd37a4e50",
+				"blk.31.attn_v.weight.lora_a":   "0eb3318b02cd313429bcc7621b539fdbb10240fea190c56c9e5f93fcd37a4e50",
+				"blk.31.attn_v.weight.lora_b":   "071dcafe89df065d6e1c935ecb8fdf6479b3c202eb912e7da938597673ff5857",
+			},
+		},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.Name, func(t *testing.T) {
+			t.Parallel()
+
+			// Destination file for the converted adapter.
+			out, err := os.CreateTemp(t.TempDir(), "f16")
+			if err != nil {
+				t.Fatal(err)
+			}
+			defer out.Close()
+
+			// Source directory holding the generated adapter fixture.
+			srcDir := t.TempDir()
+			generateLoraTestData(t, srcDir)
+
+			err = ConvertAdapter(os.DirFS(srcDir), out, tc.BaseKV)
+			if err != nil {
+				t.Fatal(err)
+			}
+
+			// Re-open the output by name to decode it from the beginning.
+			r, err := os.Open(out.Name())
+			if err != nil {
+				t.Fatal(err)
+			}
+			defer r.Close()
+
+			m, _, err := llm.DecodeGGML(r, math.MaxInt)
+			if err != nil {
+				t.Fatal(err)
+			}
+
+			if _, err := r.Seek(0, io.SeekStart); err != nil {
+				t.Fatal(err)
+			}
+
+			actual := generateResultsJSON(t, r, m.KV(), m.Tensors())
+
+			// Walk the expected keys in sorted order so failures are reported
+			// deterministically.
+			keys := maps.Keys(tc.Expected)
+			slices.Sort(keys)
+			for _, k := range keys {
+				got, found := actual[k]
+				switch {
+				case !found:
+					t.Errorf("missing %s", k)
+				case got != tc.Expected[k]:
+					t.Errorf("unexpected %s: want %s, got %s", k, tc.Expected[k], got)
+				}
+			}
+		})
+	}
+}
+
+// generateLoraTestData writes a LoRA adapter fixture into tempDir:
+//
+//   - adapters.safetensors: an 8-byte little-endian header length, the JSON
+//     header describing four F32 tensors (q_proj/v_proj lora_a and lora_b for
+//     layer 31) plus a null "__metadata__" entry, and a payload in which every
+//     float32 element is 1.
+//   - adapter_config.json: a training configuration with rank 8 and alpha 16.
+//
+// Any failure aborts the calling test.
+func generateLoraTestData(t *testing.T, tempDir string) {
+	// Report failures at the caller's line rather than inside this helper.
+	t.Helper()
+
+	type tensorData struct {
+		Offsets []int  `json:"data_offsets"`
+		Type    string `json:"dtype"`
+		Shape   []int  `json:"shape"`
+	}
+
+	// Byte size of one 4096x8 float32 tensor.
+	offset := 4096 * 8 * 4
+
+	td := map[string]*tensorData{
+		"__metadata__": nil,
+		"model.layers.31.self_attn.q_proj.lora_a": {
+			Offsets: []int{0, offset},
+			Type:    "F32",
+			Shape:   []int{4096, 8},
+		},
+		"model.layers.31.self_attn.q_proj.lora_b": {
+			Offsets: []int{offset, offset * 2},
+			Type:    "F32",
+			Shape:   []int{8, 4096},
+		},
+		"model.layers.31.self_attn.v_proj.lora_a": {
+			Offsets: []int{offset * 2, offset * 3},
+			Type:    "F32",
+			Shape:   []int{4096, 8},
+		},
+		"model.layers.31.self_attn.v_proj.lora_b": {
+			Offsets: []int{offset * 3, offset*3 + 8*1024*4},
+			Type:    "F32",
+			Shape:   []int{8, 1024},
+		},
+	}
+
+	data, err := json.Marshal(td)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Header length prefix followed by the JSON header itself.
+	var buf bytes.Buffer
+	if err := binary.Write(&buf, binary.LittleEndian, int64(len(data))); err != nil {
+		t.Fatal(err)
+	}
+
+	if _, err := buf.Write(data); err != nil {
+		t.Fatal(err)
+	}
+
+	// ones returns a slice of n float32 values, all set to 1.
+	ones := func(n int) []float32 {
+		vals := make([]float32, n)
+		for i := range vals {
+			vals[i] = 1
+		}
+		return vals
+	}
+
+	// Payload for the three 4096*8-element tensors, in header offset order.
+	large := ones(4096 * 8)
+	for range 3 {
+		if err := binary.Write(&buf, binary.LittleEndian, large); err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	// Payload for the final 8x1024 tensor (v_proj.lora_b).
+	if err := binary.Write(&buf, binary.LittleEndian, ones(1024*8)); err != nil {
+		t.Fatal(err)
+	}
+
+	configData := `
+{
+    "adapter_path": "adapters-test",
+    "batch_size": 8,
+    "config": "config-tiny.json",
+    "data": "../discollama-completion",
+    "grad_checkpoint": null,
+    "iters": 1000,
+    "learning_rate": 1e-05,
+    "lora_layers": 1,
+    "lora_parameters": {
+        "rank": 8,
+        "alpha": 16,
+        "dropout": 0.0,
+        "scale": 2.0
+    },
+    "lr_schedule": null,
+    "max_seq_length": 2048,
+    "model": "/Users/pdevine/git/Meta-Llama-3-8B-Instruct",
+    "resume_adapter_file": null,
+    "save_every": 100,
+    "seed": 0,
+    "steps_per_eval": 200,
+    "steps_per_report": 10,
+    "test": false,
+    "test_batches": 500,
+    "train": true,
+    "use_dora": false,
+    "val_batches": 25
+}
+`
+
+	// os.WriteFile closes each file before returning and surfaces close
+	// errors, so nothing is left open or unchecked when the caller starts
+	// reading the directory.
+	for _, file := range []struct {
+		name string
+		data []byte
+	}{
+		{"adapters.safetensors", buf.Bytes()},
+		{"adapter_config.json", []byte(configData)},
+	} {
+		if err := os.WriteFile(filepath.Join(tempDir, file.name), file.data, 0o644); err != nil {
+			t.Fatal(err)
+		}
+	}
+}
--- a/convert/reader.go
+++ b/convert/reader.go
@ -64,6 +64,8 @@ func parseTensors(fsys fs.FS, replacer *strings.Replacer) ([]Tensor, error) {
 	}{
 		{"model-*-of-*.safetensors", parseSafetensors},
 		{"model.safetensors", parseSafetensors},
+		{"adapters.safetensors", parseSafetensors},
+		{"adapter_model.safetensors", parseSafetensors},
 		{"pytorch_model-*-of-*.bin", parseTorch},
 		{"pytorch_model.bin", parseTorch},
 		{"consolidated.*.pth", parseTorch},
--- a/convert/reader_safetensors.go
+++ b/convert/reader_safetensors.go
@ -4,6 +4,7 @@ import (
 	"bytes"
 	"encoding/binary"
 	"encoding/json"
+	"errors"
 	"fmt"
 	"io"
 	"io/fs"
@ -50,6 +51,10 @@ func parseSafetensors(fsys fs.FS, replacer *strings.Replacer, ps ...string) ([]T

 		for _, key := range keys {
 			if value := headers[key]; value.Type != "" {
+				// bitsandbytes quantized models are unsupported
+				if len(value.Shape) == 0 {
+					return nil, errors.New("unsupported safetensors model")
+				}
 				ts = append(ts, safetensor{
 					fs:     fsys,
 					path:   p,
--- a/convert/tokenizer.go
+++ b/convert/tokenizer.go
@ -100,8 +100,21 @@ func parseTokenizer(fsys fs.FS, specialTokenTypes []string) (*Tokenizer, error)
 		}

 		if template, ok := p["chat_template"]; ok {
-			if err := json.Unmarshal(template, &t.Template); err != nil {
-				return nil, err
+			var s []struct {
+				Name     string `json:"name"`
+				Template string `json:"template"`
+			}
+			if err := json.Unmarshal(template, &t.Template); err == nil {
+				// noop
+			} else if err := json.Unmarshal(template, &s); err == nil {
+				for _, e := range s {
+					if e.Name == "default" {
+						t.Template = e.Template
+						break
+					}
+				}
+			} else {
+				return nil, fmt.Errorf("invalid chat_template: %w", err)
 			}
 		}

@ -141,7 +154,6 @@ func parseTokenizer(fsys fs.FS, specialTokenTypes []string) (*Tokenizer, error)
 }

 type tokenizer struct {
-	Version     string  `json:"version"`
 	AddedTokens []token `json:"added_tokens"`
 	Model       struct {
 		Type   string         `json:"type"`
@ -239,7 +251,7 @@ func parseVocabulary(fsys fs.FS) (*Vocabulary, error) {
 		return pattern.Func(fsys)
 	}

-	return nil, errors.New("unknown tensor format")
+	return nil, errors.New("unknown tokenizer format")
 }

 type SpecialVocabulary struct {
--- a/convert/tokenizer_test.go
+++ b/convert/tokenizer_test.go
@ -0,0 +1,208 @@
+package convert
+
+import (
+	"io"
+	"io/fs"
+	"os"
+	"path/filepath"
+	"strings"
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+)
+
+// createTokenizerFS materialises files inside dir, one file per map entry
+// (key = file name relative to dir, value = content source), and returns dir
+// as an fs.FS. Any create or copy error fails the test immediately.
+func createTokenizerFS(t *testing.T, dir string, files map[string]io.Reader) fs.FS {
+	t.Helper()
+
+	// writeFile copies src into dir/name and closes the file before returning.
+	writeFile := func(name string, src io.Reader) error {
+		dst, err := os.Create(filepath.Join(dir, name))
+		if err != nil {
+			return err
+		}
+		defer dst.Close()
+
+		_, err = io.Copy(dst, src)
+		return err
+	}
+
+	for name, src := range files {
+		err := writeFile(name, src)
+		if err != nil {
+			t.Fatalf("unexpected error: %v", err)
+		}
+	}
+
+	return os.DirFS(dir)
+}
+
+// TestParseTokenizer is a table-driven test for parseTokenizer. Each case
+// writes its fixture files into a fresh temp directory (while the table is
+// being built, using the outer t) and compares the parsed *Tokenizer against
+// want with cmp.Diff.
+func TestParseTokenizer(t *testing.T) {
+	cases := []struct {
+		name              string
+		fsys              fs.FS
+		specialTokenTypes []string
+		want              *Tokenizer
+	}{
+		{
+			// chat_template given as a plain JSON string is used verbatim.
+			name: "string chat template",
+			fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{
+				"tokenizer.json": strings.NewReader(`{}`),
+				"tokenizer_config.json": strings.NewReader(`{
+					"chat_template": "<default template>"
+				}`),
+			}),
+			want: &Tokenizer{
+				Vocabulary: &Vocabulary{Model: "gpt2"},
+				Pre:        "default",
+				Template:   "<default template>",
+			},
+		},
+		{
+			// chat_template given as a list of {name, template} objects: the
+			// entry named "default" is expected to be selected.
+			name: "list chat template",
+			fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{
+				"tokenizer.json": strings.NewReader(`{}`),
+				"tokenizer_config.json": strings.NewReader(`{
+					"chat_template": [
+						{
+							"name": "default",
+							"template": "<default template>"
+						},
+						{
+							"name": "tools",
+							"template": "<tools template>"
+						}
+					]
+				}`),
+			}),
+			want: &Tokenizer{
+				Vocabulary: &Vocabulary{Model: "gpt2"},
+				Pre:        "default",
+				Template:   "<default template>",
+			},
+		},
+		{
+			// A non-special added token with no model vocab: expected to
+			// appear with score equal to its id (999) and type 4.
+			name: "added tokens",
+			fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{
+				"tokenizer.json": strings.NewReader(`{
+					"added_tokens": [
+						{
+							"id": 999,
+							"content": "<unused999>",
+							"special": false
+						}
+					]
+				}`),
+			}),
+			want: &Tokenizer{
+				Vocabulary: &Vocabulary{
+					Model:  "gpt2",
+					Tokens: []string{"<unused999>"},
+					Scores: []float32{999},
+					Types:  []int32{4},
+				},
+				Pre: "default",
+			},
+		},
+		{
+			// The same token appears in both added_tokens and model.vocab:
+			// expected once in the output, with type 3 because it is special.
+			name: "added tokens overlap vocab",
+			fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{
+				"tokenizer.json": strings.NewReader(`{
+					"added_tokens": [
+						{
+							"id": 0,
+							"content": "<pad>",
+							"special": true
+						}
+					],
+					"model": {
+						"vocab": {
+							"<pad>": 0
+						}
+					}
+				}`),
+			}),
+			want: &Tokenizer{
+				Vocabulary: &Vocabulary{
+					Model:  "gpt2",
+					Tokens: []string{"<pad>"},
+					Scores: []float32{0},
+					Types:  []int32{3},
+				},
+				Pre: "default",
+			},
+		},
+		{
+			// Special token types requested via specialTokenTypes are resolved
+			// from tokenizer_config.json; only bos is expected to have
+			// AddToken set, matching add_bos_token=true / add_eos_token=false.
+			name: "special token types",
+			fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{
+				"tokenizer.json": strings.NewReader(`{
+					"added_tokens": [
+						{
+							"id": 0,
+							"content": "<pad>",
+							"special": true
+						},
+						{
+							"id": 1,
+							"content": "<eos>",
+							"special": true
+						},
+						{
+							"id": 2,
+							"content": "<bos>",
+							"special": true
+						},
+						{
+							"id": 3,
+							"content": "<unk>",
+							"special": true
+						}
+					],
+					"model": {
+						"vocab": {
+							"<pad>": 0,
+							"<eos>": 1,
+							"<bos>": 2,
+							"<unk>": 3
+						}
+					}
+				}`),
+				"tokenizer_config.json": strings.NewReader(`{
+					"add_bos_token": true,
+					"add_eos_token": false,
+					"bos_token": "<bos>",
+					"eos_token": "<eos>",
+					"pad_token": "<pad>",
+					"unk_token": "<unk>"
+				}`),
+			}),
+			specialTokenTypes: []string{"pad", "eos", "bos", "unk"},
+			want: &Tokenizer{
+				Vocabulary: &Vocabulary{
+					Model:  "gpt2",
+					Tokens: []string{"<pad>", "<eos>", "<bos>", "<unk>"},
+					Scores: []float32{0, 1, 2, 3},
+					Types:  []int32{3, 3, 3, 3},
+				},
+				SpecialVocabulary: []*SpecialVocabulary{
+					{Type: "pad", Content: "<pad>", ID: 0, AddToken: false},
+					{Type: "eos", Content: "<eos>", ID: 1, AddToken: false},
+					{Type: "bos", Content: "<bos>", ID: 2, AddToken: true},
+					{Type: "unk", Content: "<unk>", ID: 3, AddToken: false},
+				},
+				Pre: "default",
+			},
+		},
+	}
+
+	for _, tt := range cases {
+		t.Run(tt.name, func(t *testing.T) {
+			tokenizer, err := parseTokenizer(tt.fsys, tt.specialTokenTypes)
+			if err != nil {
+				t.Fatalf("unexpected error: %v", err)
+			}
+
+			// An empty diff means the parsed tokenizer matches want exactly.
+			if diff := cmp.Diff(tt.want, tokenizer); diff != "" {
+				t.Errorf("unexpected tokenizer (-want +got):\n%s", diff)
+			}
+		})
+	}
+}
--- a/docs/faq.md
+++ b/docs/faq.md
@ -111,7 +111,10 @@ On Windows, Ollama inherits your user and system environment variables.

 ## How do I use Ollama behind a proxy?

-Ollama is compatible with proxy servers if `HTTP_PROXY` or `HTTPS_PROXY` are configured. When using either variables, ensure it is set where `ollama serve` can access the values. When using `HTTPS_PROXY`, ensure the proxy certificate is installed as a system certificate. Refer to the section above for how to use environment variables on your platform.
+Ollama pulls models from the Internet and may require a proxy server to access the models. Use `HTTPS_PROXY` to redirect outbound requests through the proxy. Ensure the proxy certificate is installed as a system certificate. Refer to the section above for how to use environment variables on your platform.
+
+> [!NOTE]
+> Avoid setting `HTTP_PROXY`. Ollama does not use HTTP for model pulls, only HTTPS. Setting `HTTP_PROXY` may interrupt client connections to the server.

 ### How do I use Ollama behind a proxy in Docker?

@ -191,6 +194,8 @@ Refer to the section [above](#how-do-i-configure-ollama-server) for how to set e

 If a different directory needs to be used, set the environment variable `OLLAMA_MODELS` to the chosen directory.

+> Note: on Linux using the standard installer, the `ollama` user needs read and write access to the specified directory. To assign the directory to the `ollama` user run `sudo chown -R ollama:ollama <directory>`.
+
 Refer to the section [above](#how-do-i-configure-ollama-server) for how to set environment variables on your platform.

 ## How can I use Ollama in Visual Studio Code?
@ -276,4 +281,4 @@ Note: Windows with Radeon GPUs currently default to 1 model maximum due to limit

 ## How does Ollama load models on multiple GPUs?

-Installing multiple GPUs of the same brand can be a great way to increase your available VRAM to load larger models.  When you load a new model, Ollama evaluates the required VRAM for the model against what is currently available.  If the model will entirely fit on any single GPU, Ollama will load the model on that GPU.  This typically provides the best performance as it reduces the amount of data transfering across the PCI bus during inference.  If the model does not fit entirely on one GPU, then it will be spread across all the available GPUs.
+Installing multiple GPUs of the same brand can be a great way to increase your available VRAM to load larger models.  When you load a new model, Ollama evaluates the required VRAM for the model against what is currently available.  If the model will entirely fit on any single GPU, Ollama will load the model on that GPU.  This typically provides the best performance as it reduces the amount of data transferring across the PCI bus during inference.  If the model does not fit entirely on one GPU, then it will be spread across all the available GPUs.
--- a/docs/images/ollama-keys.png
+++ b/docs/images/ollama-keys.png
--- a/docs/images/signup.png
+++ b/docs/images/signup.png
--- a/docs/import.md
+++ b/docs/import.md
@ -1,44 +1,129 @@
-# Import
+# Importing a model

-GGUF models and select Safetensors models can be imported directly into Ollama.
+## Table of Contents

-## Import GGUF
+  * [Importing a Safetensors adapter](#Importing-a-fine-tuned-adapter-from-Safetensors-weights)
+  * [Importing a Safetensors model](#Importing-a-model-from-Safetensors-weights)
+  * [Importing a GGUF file](#Importing-a-GGUF-based-model-or-adapter)
+  * [Sharing models on ollama.com](#Sharing-your-model-on-ollamacom)

-A binary GGUF file can be imported directly into Ollama through a Modelfile.
+## Importing a fine tuned adapter from Safetensors weights
+
+First, create a `Modelfile` with a `FROM` command pointing at the base model you used for fine tuning, and an `ADAPTER` command which points to the directory with your Safetensors adapter:

 ```dockerfile
-FROM /path/to/file.gguf
+FROM <base model name>
+ADAPTER /path/to/safetensors/adapter/directory
 ```

-## Import Safetensors
+Make sure that you use the same base model in the `FROM` command as you used to create the adapter otherwise you will get erratic results. Most frameworks use different quantization methods, so it's best to use non-quantized (i.e. non-QLoRA) adapters. If your adapter is in the same directory as your `Modelfile`, use `ADAPTER .` to specify the adapter path.

-If the model being imported is one of these architectures, it can be imported directly into Ollama through a Modelfile:
+Now run `ollama create` from the directory where the `Modelfile` was created:

- - LlamaForCausalLM
- - MistralForCausalLM
- - MixtralForCausalLM
- - GemmaForCausalLM
- - Phi3ForCausalLM
+```bash
+ollama create my-model
+```
+
+Lastly, test the model:
+
+```bash
+ollama run my-model
+```
+
+Ollama supports importing adapters based on several different model architectures including:
+
+  * Llama (including Llama 2, Llama 3, and Llama 3.1);
+  * Mistral (including Mistral 1, Mistral 2, and Mixtral); and
+  * Gemma (including Gemma 1 and Gemma 2)
+
+You can create the adapter using a fine tuning framework or tool which can output adapters in the Safetensors format, such as:
+
+  * Hugging Face [fine tuning framework](https://huggingface.co/docs/transformers/en/training)
+  * [Unsloth](https://github.com/unslothai/unsloth)
+  * [MLX](https://github.com/ml-explore/mlx)
+
+
+## Importing a model from Safetensors weights
+
+First, create a `Modelfile` with a `FROM` command which points to the directory containing your Safetensors weights:

 ```dockerfile
 FROM /path/to/safetensors/directory
 ```

-For architectures not directly convertable by Ollama, see llama.cpp's [guide](https://github.com/ggerganov/llama.cpp/blob/master/README.md#prepare-and-quantize) on conversion. After conversion, see [Import GGUF](#import-gguf).
+If you create the Modelfile in the same directory as the weights, you can use the command `FROM .`.

-## Automatic Quantization
+Now run the `ollama create` command from the directory where you created the `Modelfile`:

-> [!NOTE]
-> Automatic quantization requires v0.1.35 or higher.
+```shell
+ollama create my-model
+```

-Ollama is capable of quantizing FP16 or FP32 models to any of the supported quantizations with the `-q/--quantize` flag in `ollama create`.
+Lastly, test the model:
+
+```shell
+ollama run my-model
+```
+
+Ollama supports importing models for several different architectures including:
+
+  * Llama (including Llama 2, Llama 3, and Llama 3.1);
+  * Mistral (including Mistral 1, Mistral 2, and Mixtral);
+  * Gemma (including Gemma 1 and Gemma 2); and
+  * Phi3
+
+This includes importing foundation models as well as any fine tuned models which have been _fused_ with a foundation model.
+
+
+## Importing a GGUF based model or adapter
+
+If you have a GGUF based model or adapter it is possible to import it into Ollama. You can obtain a GGUF model or adapter by:
+
+  * converting a Safetensors model with the `convert_hf_to_gguf.py` from Llama.cpp; 
+  * converting a Safetensors adapter with the `convert_lora_to_gguf.py` from Llama.cpp; or
+  * downloading a model or adapter from a place such as HuggingFace
+
+To import a GGUF model, create a `Modelfile` containing:
+
+```dockerfile
+FROM /path/to/file.gguf
+```
+
+For a GGUF adapter, create the `Modelfile` with:
+
+```dockerfile
+FROM <model name>
+ADAPTER /path/to/file.gguf
+```
+
+When importing a GGUF adapter, it's important to use the same base model as the base model that the adapter was created with. You can use:
+
+ * a model from Ollama
+ * a GGUF file
+ * a Safetensors based model 
+
+Once you have created your `Modelfile`, use the `ollama create` command to build the model.
+
+```shell
+ollama create my-model
+```
+
+## Quantizing a Model
+
+Quantizing a model allows you to run models faster and with less memory consumption but at reduced accuracy. This allows you to run a model on more modest hardware.
+
+Ollama can quantize FP16 and FP32 based models into different quantization levels using the `-q/--quantize` flag with the `ollama create` command.
+
+First, create a Modelfile with the FP16 or FP32 based model you wish to quantize.

 ```dockerfile
 FROM /path/to/my/gemma/f16/model
 ```

+Use `ollama create` to then create the quantized model.
+
 ```shell
-$ ollama create -q Q4_K_M mymodel
+$ ollama create --quantize q4_K_M mymodel
 transferring model data
 quantizing F16 model to Q4_K_M
 creating new layer sha256:735e246cc1abfd06e9cdcf95504d6789a6cd1ad7577108a70d9902fef503c1bd
@ -49,42 +134,53 @@ success

 ### Supported Quantizations

- `Q4_0`
- `Q4_1`
- `Q5_0`
- `Q5_1`
- `Q8_0`
+- `q4_0`
+- `q4_1`
+- `q5_0`
+- `q5_1`
+- `q8_0`

 #### K-means Quantizations

- `Q3_K_S`
- `Q3_K_M`
- `Q3_K_L`
- `Q4_K_S`
- `Q4_K_M`
- `Q5_K_S`
- `Q5_K_M`
- `Q6_K`
+- `q3_K_S`
+- `q3_K_M`
+- `q3_K_L`
+- `q4_K_S`
+- `q4_K_M`
+- `q5_K_S`
+- `q5_K_M`
+- `q6_K`

-## Template Detection

-> [!NOTE]
-> Template detection requires v0.1.42 or higher.
+## Sharing your model on ollama.com

-Ollama uses model metadata, specifically `tokenizer.chat_template`, to automatically create a template appropriate for the model you're importing.
+You can share any model you have created by pushing it to [ollama.com](https://ollama.com) so that other users can try it out.

-```dockerfile
-FROM /path/to/my/gemma/model
-```
+First, use your browser to go to the [Ollama Sign-Up](https://ollama.com/signup) page. If you already have an account, you can skip this step.
+
+<img src="images/signup.png" alt="Sign-Up" width="40%">
+
+The `Username` field will be used as part of your model's name (e.g. `jmorganca/mymodel`), so make sure you are comfortable with the username that you have selected.
+
+Now that you have created an account and are signed-in, go to the [Ollama Keys Settings](https://ollama.com/settings/keys) page.
+
+Follow the directions on the page to determine where your Ollama Public Key is located.
+
+<img src="images/ollama-keys.png" alt="Ollama Keys" width="80%">
+
+Click on the `Add Ollama Public Key` button, and copy and paste the contents of your Ollama Public Key into the text field.
+
+To push a model to [ollama.com](https://ollama.com), first make sure that it is named correctly with your username. You may have to use the `ollama cp` command to copy
+your model to give it the correct name. Once you're happy with your model's name, use the `ollama push` command to push it to [ollama.com](https://ollama.com).

 ```shell
-$ ollama create mymodel
-transferring model data
-using autodetected template gemma-instruct
-creating new layer sha256:baa2a0edc27d19cc6b7537578a9a7ba1a4e3214dc185ed5ae43692b319af7b84
-creating new layer sha256:ba66c3309914dbef07e5149a648fd1877f030d337a4f240d444ea335008943cb
-writing manifest
-success
+ollama cp mymodel myuser/mymodel
+ollama push myuser/mymodel
+```
+
+Once your model has been pushed, other users can pull and run it by using the command:
+
+```shell
+ollama run myuser/mymodel
 ```

-Defining a template in the Modelfile will disable this feature which may be useful if you want to use a different template than the autodetected one.
--- a/docs/linux.md
+++ b/docs/linux.md
@ -28,12 +28,18 @@ Download and extract the Linux package:
 curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz | sudo tar zx -C /usr
 ```

+If you have an AMD GPU, also download and extract the ROCm package into the same location
+```bash
+curl -fsSL https://ollama.com/download/ollama-linux-amd64-rocm.tgz | sudo tar zx -C /usr
+```
+
 ### Adding Ollama as a startup service (recommended)

-Create a user for Ollama:
+Create a user and group for Ollama:

 ```bash
-sudo useradd -r -s /bin/false -m -d /usr/share/ollama ollama
+sudo useradd -r -s /bin/false -U -m -d /usr/share/ollama ollama
+sudo usermod -a -G ollama $(whoami)
 ```

 Create a service file in `/etc/systemd/system/ollama.service`:
@ -49,6 +55,7 @@ User=ollama
 Group=ollama
 Restart=always
 RestartSec=3
+Environment="PATH=$PATH"

 [Install]
 WantedBy=default.target
@ -78,10 +85,11 @@ Make sure to install ROCm v6

 ### Start Ollama

-Start Ollama using `systemd`:
+Start Ollama and verify it is running:

 ```bash
 sudo systemctl start ollama
+sudo systemctl status ollama
 ```

 ## Update
--- a/docs/modelfile.md
+++ b/docs/modelfile.md
@ -11,8 +11,9 @@ A model file is the blueprint to create and share models with Ollama.
 - [Examples](#examples)
 - [Instructions](#instructions)
  - [FROM (Required)](#from-required)
-    - [Build from llama3](#build-from-llama3)
-    - [Build from a bin file](#build-from-a-bin-file)
+    - [Build from llama3.1](#build-from-llama31)
+    - [Build from a Safetensors model](#build-from-a-safetensors-model)
+    - [Build from a GGUF file](#build-from-a-gguf-file)
  - [PARAMETER](#parameter)
    - [Valid Parameters and Values](#valid-parameters-and-values)
  - [TEMPLATE](#template)
@ -99,22 +100,39 @@ The `FROM` instruction defines the base model to use when creating a model.
 FROM <model name>:<tag>
 ```

-#### Build from llama3
+#### Build from llama3.1

 ```modelfile
-FROM llama3
+FROM llama3.1
 ```

 A list of available base models:
 <https://github.com/ollama/ollama#model-library>
+Additional models can be found at:
+<https://ollama.com/library>

-#### Build from a `bin` file
+#### Build from a Safetensors model

 ```modelfile
-FROM ./ollama-model.bin
+FROM <model directory>
 ```

-This bin file location should be specified as an absolute path or relative to the `Modelfile` location.
+The model directory should contain the Safetensors weights for a supported architecture.
+
+Currently supported model architectures:
+  * Llama (including Llama 2, Llama 3, and Llama 3.1)
+  * Mistral (including Mistral 1, Mistral 2, and Mixtral)
+  * Gemma (including Gemma 1 and Gemma 2)
+  * Phi3
+
+#### Build from a GGUF file
+
+```modelfile
+FROM ./ollama-model.gguf
+```
+
+The GGUF file location should be specified as an absolute path or relative to the `Modelfile` location.
+

 ### PARAMETER

@ -174,10 +192,23 @@ SYSTEM """<system message>"""

 ### ADAPTER

-The `ADAPTER` instruction is an optional instruction that specifies any LoRA adapter that should apply to the base model. The value of this instruction should be an absolute path or a path relative to the Modelfile and the file must be in a GGML file format. The adapter should be tuned from the base model otherwise the behaviour is undefined.
+The `ADAPTER` instruction specifies a fine-tuned LoRA adapter that should apply to the base model. The value of the adapter should be an absolute path or a path relative to the Modelfile. The base model should be specified with a `FROM` instruction. If the base model is not the same as the base model that the adapter was tuned from, the behaviour will be erratic.
+
+#### Safetensor adapter

 ```modelfile
-ADAPTER ./ollama-lora.bin
+ADAPTER <path to safetensor adapter>
+```
+
+Currently supported Safetensor adapters:
+  * Llama (including Llama 2, Llama 3, and Llama 3.1)
+  * Mistral (including Mistral 1, Mistral 2, and Mixtral)
+  * Gemma (including Gemma 1 and Gemma 2)
+
+#### GGUF adapter
+
+```modelfile
+ADAPTER ./ollama-lora.gguf
 ```

 ### LICENSE
--- a/docs/openai.md
+++ b/docs/openai.md
@ -300,3 +300,28 @@ curl http://localhost:11434/v1/chat/completions \
        ]
    }'
 ```
+
+### Setting the context size
+
+The OpenAI API does not have a way of setting the context size for a model. If you need to change the context size, create a `Modelfile` which looks like:
+
+```modelfile
+FROM <some model>
+PARAMETER num_ctx <context size>
+```
+
+Use the `ollama create mymodel` command to create a new model with the updated context size. Call the API with the updated model name:
+
+```shell
+curl http://localhost:11434/v1/chat/completions \
+    -H "Content-Type: application/json" \
+    -d '{
+        "model": "mymodel",
+        "messages": [
+            {
+                "role": "user",
+                "content": "Hello!"
+            }
+        ]
+    }'
+```
--- a/envconfig/config.go
+++ b/envconfig/config.go
@ -30,9 +30,7 @@ func Host() *url.URL {
 		defaultPort = "443"
 	}

-	// trim trailing slashes
-	hostport = strings.TrimRight(hostport, "/")
-
+	hostport, path, _ := strings.Cut(hostport, "/")
 	host, port, err := net.SplitHostPort(hostport)
 	if err != nil {
 		host, port = "127.0.0.1", defaultPort
@ -45,15 +43,13 @@ func Host() *url.URL {

 	if n, err := strconv.ParseInt(port, 10, 32); err != nil || n > 65535 || n < 0 {
 		slog.Warn("invalid port, using default", "port", port, "default", defaultPort)
-		return &url.URL{
-			Scheme: scheme,
-			Host:   net.JoinHostPort(host, defaultPort),
-		}
+		port = defaultPort
 	}

 	return &url.URL{
 		Scheme: scheme,
 		Host:   net.JoinHostPort(host, port),
+		Path:   path,
 	}
 }

@ -190,7 +186,7 @@ func RunnersDir() (p string) {
 	}

 	var paths []string
-	for _, root := range []string{filepath.Dir(exe), filepath.Join(filepath.Dir(exe), ".."), cwd} {
+	for _, root := range []string{filepath.Dir(exe), filepath.Join(filepath.Dir(exe), LibRelativeToExe()), cwd} {
 		paths = append(paths,
 			root,
 			filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH),
@ -282,3 +278,12 @@ func Values() map[string]string {
 func Var(key string) string {
 	return strings.Trim(strings.TrimSpace(os.Getenv(key)), "\"'")
 }
+
+// LibRelativeToExe returns the directory to search, relative to the executable's directory: "." on Windows,
+// where the binary is kept at the top directory, and ".." on other platforms, which use a "bin" directory.
+func LibRelativeToExe() string {
+	if runtime.GOOS == "windows" {
+		return "."
+	}
+	return ".."
+}
--- a/envconfig/config_test.go
+++ b/envconfig/config_test.go
@ -13,34 +13,35 @@ func TestHost(t *testing.T) {
 		value  string
 		expect string
 	}{
-		"empty":               {"", "127.0.0.1:11434"},
-		"only address":        {"1.2.3.4", "1.2.3.4:11434"},
-		"only port":           {":1234", ":1234"},
-		"address and port":    {"1.2.3.4:1234", "1.2.3.4:1234"},
-		"hostname":            {"example.com", "example.com:11434"},
-		"hostname and port":   {"example.com:1234", "example.com:1234"},
-		"zero port":           {":0", ":0"},
-		"too large port":      {":66000", ":11434"},
-		"too small port":      {":-1", ":11434"},
-		"ipv6 localhost":      {"[::1]", "[::1]:11434"},
-		"ipv6 world open":     {"[::]", "[::]:11434"},
-		"ipv6 no brackets":    {"::1", "[::1]:11434"},
-		"ipv6 + port":         {"[::1]:1337", "[::1]:1337"},
-		"extra space":         {" 1.2.3.4 ", "1.2.3.4:11434"},
-		"extra quotes":        {"\"1.2.3.4\"", "1.2.3.4:11434"},
-		"extra space+quotes":  {" \" 1.2.3.4 \" ", "1.2.3.4:11434"},
-		"extra single quotes": {"'1.2.3.4'", "1.2.3.4:11434"},
-		"http":                {"http://1.2.3.4", "1.2.3.4:80"},
-		"http port":           {"http://1.2.3.4:4321", "1.2.3.4:4321"},
-		"https":               {"https://1.2.3.4", "1.2.3.4:443"},
-		"https port":          {"https://1.2.3.4:4321", "1.2.3.4:4321"},
+		"empty":               {"", "http://127.0.0.1:11434"},
+		"only address":        {"1.2.3.4", "http://1.2.3.4:11434"},
+		"only port":           {":1234", "http://:1234"},
+		"address and port":    {"1.2.3.4:1234", "http://1.2.3.4:1234"},
+		"hostname":            {"example.com", "http://example.com:11434"},
+		"hostname and port":   {"example.com:1234", "http://example.com:1234"},
+		"zero port":           {":0", "http://:0"},
+		"too large port":      {":66000", "http://:11434"},
+		"too small port":      {":-1", "http://:11434"},
+		"ipv6 localhost":      {"[::1]", "http://[::1]:11434"},
+		"ipv6 world open":     {"[::]", "http://[::]:11434"},
+		"ipv6 no brackets":    {"::1", "http://[::1]:11434"},
+		"ipv6 + port":         {"[::1]:1337", "http://[::1]:1337"},
+		"extra space":         {" 1.2.3.4 ", "http://1.2.3.4:11434"},
+		"extra quotes":        {"\"1.2.3.4\"", "http://1.2.3.4:11434"},
+		"extra space+quotes":  {" \" 1.2.3.4 \" ", "http://1.2.3.4:11434"},
+		"extra single quotes": {"'1.2.3.4'", "http://1.2.3.4:11434"},
+		"http":                {"http://1.2.3.4", "http://1.2.3.4:80"},
+		"http port":           {"http://1.2.3.4:4321", "http://1.2.3.4:4321"},
+		"https":               {"https://1.2.3.4", "https://1.2.3.4:443"},
+		"https port":          {"https://1.2.3.4:4321", "https://1.2.3.4:4321"},
+		"proxy path":          {"https://example.com/ollama", "https://example.com:443/ollama"},
 	}

 	for name, tt := range cases {
 		t.Run(name, func(t *testing.T) {
 			t.Setenv("OLLAMA_HOST", tt.value)
-			if host := Host(); host.Host != tt.expect {
-				t.Errorf("%s: expected %s, got %s", name, tt.expect, host.Host)
+			if host := Host(); host.String() != tt.expect {
+				t.Errorf("%s: expected %s, got %s", name, tt.expect, host.String())
 			}
 		})
 	}
--- a/gpu/amd_common.go
+++ b/gpu/amd_common.go
@ -9,6 +9,8 @@ import (
 	"path/filepath"
 	"runtime"
 	"strings"
+
+	"github.com/ollama/ollama/envconfig"
 )

 // Determine if the given ROCm lib directory is usable by checking for existence of some glob patterns
@ -54,7 +56,7 @@ func commonAMDValidateLibDir() (string, error) {
 	// Installer payload location if we're running the installed binary
 	exe, err := os.Executable()
 	if err == nil {
-		rocmTargetDir := filepath.Join(filepath.Dir(exe), "..", "lib", "ollama")
+		rocmTargetDir := filepath.Join(filepath.Dir(exe), envconfig.LibRelativeToExe(), "lib", "ollama")
 		if rocmLibUsable(rocmTargetDir) {
 			slog.Debug("detected ROCM next to ollama executable " + rocmTargetDir)
 			return rocmTargetDir, nil
--- a/gpu/amd_windows.go
+++ b/gpu/amd_windows.go
@ -153,7 +153,7 @@ func AMDValidateLibDir() (string, error) {
 	// Installer payload (if we're running from some other location)
 	localAppData := os.Getenv("LOCALAPPDATA")
 	appDir := filepath.Join(localAppData, "Programs", "Ollama")
-	rocmTargetDir := filepath.Join(appDir, "..", "lib", "ollama")
+	rocmTargetDir := filepath.Join(appDir, envconfig.LibRelativeToExe(), "lib", "ollama")
 	if rocmLibUsable(rocmTargetDir) {
 		slog.Debug("detected ollama installed ROCm at " + rocmTargetDir)
 		return rocmTargetDir, nil
--- a/gpu/cuda_common.go
+++ b/gpu/cuda_common.go
@ -57,7 +57,7 @@ func cudaVariant(gpuInfo CudaGPUInfo) string {
 		}
 	}

-	if gpuInfo.computeMajor < 6 || gpuInfo.DriverMajor < 12 {
+	if gpuInfo.computeMajor < 6 || gpuInfo.DriverMajor < 12 || (gpuInfo.DriverMajor == 12 && gpuInfo.DriverMinor == 0) {
 		return "v11"
 	}
 	return "v12"
--- a/gpu/gpu.go
+++ b/gpu/gpu.go
@ -264,6 +264,8 @@ func GetGPUInfo() GpuInfoList {
 				gpuInfo.computeMajor = int(memInfo.major)
 				gpuInfo.computeMinor = int(memInfo.minor)
 				gpuInfo.MinimumMemory = cudaMinimumMemory
+				gpuInfo.DriverMajor = driverMajor
+				gpuInfo.DriverMinor = driverMinor
 				variant := cudaVariant(gpuInfo)
 				if depPath != "" {
 					gpuInfo.DependencyPath = depPath
@ -275,8 +277,6 @@ func GetGPUInfo() GpuInfoList {
 					}
 				}
 				gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
-				gpuInfo.DriverMajor = driverMajor
-				gpuInfo.DriverMinor = driverMinor
 				gpuInfo.Variant = variant

 				// query the management library as well so we can record any skew between the two
@ -653,7 +653,7 @@ func LibraryDir() string {
 		slog.Warn("failed to lookup working directory", "error", err)
 	}
 	// Scan for any of our dependencies, and pick first match
-	for _, root := range []string{filepath.Dir(appExe), filepath.Join(filepath.Dir(appExe), ".."), cwd} {
+	for _, root := range []string{filepath.Dir(appExe), filepath.Join(filepath.Dir(appExe), envconfig.LibRelativeToExe()), cwd} {
 		libDep := filepath.Join("lib", "ollama")
 		if _, err := os.Stat(filepath.Join(root, libDep)); err == nil {
 			return filepath.Join(root, libDep)
--- a/gpu/gpu_test.go
+++ b/gpu/gpu_test.go
@ -32,4 +32,29 @@ func TestCPUMemInfo(t *testing.T) {
 	}
 }

+func TestByLibrary(t *testing.T) {
+	type testCase struct {
+		input  []GpuInfo
+		expect int
+	}
+
+	testCases := map[string]*testCase{
+		"empty":                    {input: []GpuInfo{}, expect: 0},
+		"cpu":                      {input: []GpuInfo{{Library: "cpu"}}, expect: 1},
+		"cpu + GPU":                {input: []GpuInfo{{Library: "cpu"}, {Library: "cuda"}}, expect: 2},
+		"cpu + 2 GPU no variant":   {input: []GpuInfo{{Library: "cpu"}, {Library: "cuda"}, {Library: "cuda"}}, expect: 2},
+		"cpu + 2 GPU same variant": {input: []GpuInfo{{Library: "cpu"}, {Library: "cuda", Variant: "v11"}, {Library: "cuda", Variant: "v11"}}, expect: 2},
+		"cpu + 2 GPU diff variant": {input: []GpuInfo{{Library: "cpu"}, {Library: "cuda", Variant: "v11"}, {Library: "cuda", Variant: "v12"}}, expect: 3},
+	}
+
+	for k, v := range testCases {
+		t.Run(k, func(t *testing.T) {
+			resp := (GpuInfoList)(v.input).ByLibrary()
+			if len(resp) != v.expect {
+				t.Fatalf("expected length %d, got %d => %+v", v.expect, len(resp), resp)
+			}
+		})
+	}
+}
+
 // TODO - add some logic to figure out card type through other means and actually verify we got back what we expected
--- a/gpu/types.go
+++ b/gpu/types.go
@ -94,7 +94,7 @@ func (l GpuInfoList) ByLibrary() []GpuInfoList {
 			}
 		}
 		if !found {
-			libs = append(libs, info.Library)
+			libs = append(libs, requested)
 			resp = append(resp, []GpuInfo{info})
 		}
 	}
--- a/llm/ext_server/CMakeLists.txt
+++ b/llm/ext_server/CMakeLists.txt
@ -2,7 +2,7 @@ set(TARGET ollama_llama_server)
 option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
 set(LLAMA_SERVER_LDFLAGS $ENV{LLAMA_SERVER_LDFLAGS})
 include_directories(${CMAKE_CURRENT_SOURCE_DIR})
-add_executable(${TARGET} server.cpp utils.hpp json.hpp httplib.h)
+add_executable(${TARGET} server.cpp utils.hpp httplib.h)
 install(TARGETS ${TARGET} RUNTIME)
 target_compile_definitions(${TARGET} PRIVATE
    SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
--- a/llm/ext_server/json.hpp
+++ b/llm/ext_server/json.hpp
--- a/llm/ext_server/server.cpp
+++ b/llm/ext_server/server.cpp
@ -262,7 +262,7 @@ struct server_slot {
       char buffer[512];
        double t_token = t_prompt_processing / n_prompt_tokens_processed;
        double n_tokens_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed;
-        sprintf(buffer, "prompt eval time     = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)",
+        snprintf(buffer, sizeof(buffer), "prompt eval time     = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)",
                t_prompt_processing, n_prompt_tokens_processed,
                t_token, n_tokens_second);
        LOG_DEBUG(buffer, {
@ -276,7 +276,7 @@ struct server_slot {

        t_token = t_token_generation / n_decoded;
        n_tokens_second = 1e3 / t_token_generation * n_decoded;
-        sprintf(buffer, "generation eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)",
+        snprintf(buffer, sizeof(buffer), "generation eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)",
                t_token_generation, n_decoded,
                t_token, n_tokens_second);
        LOG_DEBUG(buffer, {
@ -288,7 +288,7 @@ struct server_slot {
            {"n_tokens_second",    n_tokens_second},
        });

-        sprintf(buffer, "          total time = %10.2f ms", t_prompt_processing + t_token_generation);
+        snprintf(buffer, sizeof(buffer), "          total time = %10.2f ms", t_prompt_processing + t_token_generation);
        LOG_DEBUG(buffer, {
            {"slot_id",             id},
            {"task_id",             task_id},
@ -425,7 +425,7 @@ struct llama_server_context

        n_ctx = llama_n_ctx(ctx);

-        add_bos_token = llama_should_add_bos_token(model);
+        add_bos_token = llama_add_bos_token(model);

        return true;
    }
@ -1031,7 +1031,7 @@ struct llama_server_context
                continue;
            }

-            if (!llava_image_embed_make_with_clip_img(clp_ctx, params.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
+            if (!llava_image_embed_make_with_clip_img(clp_ctx, params.cpuparams.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
                LOG_TEE("Error processing the given image");
                return false;
            }
@ -2014,7 +2014,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
    printf("options:\n");
    printf("  -h, --help                show this help message and exit\n");
    printf("  -v, --verbose             verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
-    printf("  -t N, --threads N         number of threads to use during computation (default: %d)\n", params.n_threads);
+    printf("  -t N, --threads N         number of threads to use during computation (default: %d)\n", params.cpuparams.n_threads);
    printf("  -tb N, --threads-batch N  number of threads to use during batch and prompt processing (default: same as --threads)\n");
    printf("  --threads-http N          number of threads in the http server pool to process requests (default: max(hardware concurrency - 1, --parallel N + 2))\n");
    printf("  -c N, --ctx-size N        size of the prompt context (default: %d)\n", params.n_ctx);
@ -2287,7 +2287,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
                invalid_param = true;
                break;
            }
-            params.n_threads = std::stoi(argv[i]);
+            params.cpuparams.n_threads = std::stoi(argv[i]);
        }
        else if (arg == "--grp-attn-n" || arg == "-gan")
        {
@ -2315,7 +2315,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
                invalid_param = true;
                break;
            }
-            params.n_threads_batch = std::stoi(argv[i]);
+            params.cpuparams_batch.n_threads = std::stoi(argv[i]);
        }
        else if (arg == "--threads-http")
        {
@ -2626,6 +2626,11 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
        params.kv_overrides.back().key[0] = 0;
    }

+    postprocess_cpu_params(params.cpuparams, nullptr);
+    postprocess_cpu_params(params.cpuparams_batch, &params.cpuparams);
+    postprocess_cpu_params(params.draft_cpuparams, &params.cpuparams);
+    postprocess_cpu_params(params.draft_cpuparams_batch, &params.cpuparams_batch);
+
    if (invalid_param)
    {
        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
@ -2775,8 +2780,8 @@ int main(int argc, char **argv) {
                            {"commit", LLAMA_COMMIT}});

    LOG_INFO("system info", {
-                                {"n_threads", params.n_threads},
-                                {"n_threads_batch", params.n_threads_batch},
+                                {"n_threads", params.cpuparams.n_threads},
+                                {"n_threads_batch", params.cpuparams_batch.n_threads},
                                {"total_threads", std::thread::hardware_concurrency()},
                                {"system_info", llama_print_system_info()},
                            });
--- a/llm/generate/gen_common.sh
+++ b/llm/generate/gen_common.sh
@ -87,6 +87,8 @@ apply_patches() {
 build() {
    cmake -S ${LLAMACPP_DIR} -B ${BUILD_DIR} ${CMAKE_DEFS}
    cmake --build ${BUILD_DIR} ${CMAKE_TARGETS} -j8
+    # remove unnecessary build artifacts
+    rm -f ${BUILD_DIR}/bin/ggml-common.h ${BUILD_DIR}/bin/ggml-metal.metal
 }

 compress() {
--- a/llm/generate/gen_darwin.sh
+++ b/llm/generate/gen_darwin.sh
@ -19,7 +19,7 @@ sign() {
    fi
 }

-COMMON_DARWIN_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DGGML_METAL_EMBED_LIBRARY=on -DGGML_OPENMP=off"
+COMMON_DARWIN_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DGGML_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DGGML_METAL_EMBED_LIBRARY=on -DGGML_OPENMP=off"

 case "${GOARCH}" in
 "amd64")
--- a/llm/generate/gen_linux.sh
+++ b/llm/generate/gen_linux.sh
@ -252,7 +252,7 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then
        ROCM_VARIANT=_v$(ls ${ROCM_PATH}/lib/librocblas.so.*.*.????? | cut -f5 -d. || true)
    fi
    init_vars
-    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DGGML_HIPBLAS=on -DLLAMA_CUDA_NO_PEER_COPY=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
+    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DGGML_HIPBLAS=on -DGGML_CUDA_NO_PEER_COPY=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
    # Users building from source can tune the exact flags we pass to cmake for configuring llama.cpp
    if [ -n "${OLLAMA_CUSTOM_ROCM_DEFS}" ]; then
        echo "OLLAMA_CUSTOM_ROCM_DEFS=\"${OLLAMA_CUSTOM_ROCM_DEFS}\""
--- a/llm/generate/gen_windows.ps1
+++ b/llm/generate/gen_windows.ps1
@ -355,7 +355,7 @@ function build_rocm() {
            "-DCMAKE_C_COMPILER=clang.exe",
            "-DCMAKE_CXX_COMPILER=clang++.exe",
            "-DGGML_HIPBLAS=on",
-            "-DLLAMA_CUDA_NO_PEER_COPY=on",
+            "-DGGML_CUDA_NO_PEER_COPY=on",
            "-DHIP_PLATFORM=amd",
            "-DGGML_AVX=on",
            "-DGGML_AVX2=off",
--- a/llm/ggml.go
+++ b/llm/ggml.go
@ -43,6 +43,14 @@ func (kv KV) Architecture() string {
 	return "unknown"
 }

+func (kv KV) Kind() string {
+	if s, ok := kv["general.type"].(string); ok {
+		return s
+	}
+
+	return "unknown"
+}
+
 func (kv KV) ParameterCount() uint64 {
 	return kv.u64("general.parameter_count")
 }
--- a/llm/llama.cpp
+++ b/llm/llama.cpp
@ -1 +1 @@
-Subproject commit 1e6f6554aa11fa10160a5fda689e736c3c34169f
+Subproject commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177
--- a/llm/patches/05-default-pretokenizer.diff
+++ b/llm/patches/05-default-pretokenizer.diff
@ -1,8 +1,8 @@
 diff --git a/src/llama.cpp b/src/llama.cpp
-index a207451f..2ddf431d 100644
+index 88355971..dd7d41ed 100644
 --- a/src/llama.cpp
 +++ b/src/llama.cpp
-@@ -5347,16 +5347,7 @@ static void llm_load_vocab(
+@@ -6083,16 +6083,7 @@ static void llm_load_vocab(
         if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
             vocab.tokenizer_add_space_prefix = false;
             vocab.tokenizer_clean_spaces = true;
@ -20,9 +20,9 @@ index a207451f..2ddf431d 100644
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
             } else if (
                     tokenizer_pre == "llama3"   ||
-@@ -5443,7 +5434,8 @@ static void llm_load_vocab(
-                 tokenizer_pre == "codeshell") {
-                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CODESHELL;
+@@ -6188,7 +6179,8 @@ static void llm_load_vocab(
+                 tokenizer_pre == "exaone") {
+                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_EXAONE;
             } else {
 -                throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
 +                LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
--- a/llm/patches/06-embeddings.diff
+++ b/llm/patches/06-embeddings.diff
@ -1,37 +1,36 @@
 diff --git a/src/llama.cpp b/src/llama.cpp
-index 1fe2b9f7..a43312a7 100644
+index 88355971..d7db689b 100644
 --- a/src/llama.cpp
 +++ b/src/llama.cpp
-@@ -13689,7 +13689,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
+@@ -15906,7 +15906,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
     const auto n_embd  = hparams.n_embd;
 
     // TODO: use a per-batch flag for logits presence instead
 -    const bool has_logits = !cparams.embeddings;
 +    const bool has_logits =  cparams.causal_attn;
-     const bool has_embd   =  lctx.is_encoding || (cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE));
+     const bool has_embd   =  cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
 
     const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
-@@ -13959,17 +13959,25 @@ static int llama_decode_internal(
+@@ -16175,20 +16175,23 @@ static int llama_decode_internal(
             // no output
             res  = nullptr;
             embd = nullptr;
 -        } else if (cparams.embeddings) {
-            res = nullptr; // do not extract logits for embedding case
-            embd = gf->nodes[gf->n_nodes - 1];
-            if (strcmp(embd->name, "result_embd_pooled") != 0) {
-                embd = gf->nodes[gf->n_nodes - 2];
+-            res  = nullptr; // do not extract logits for embedding case
+-            embd = nullptr;
 +        }
 +
 +        if (cparams.embeddings) {
-+            for (int i = gf->n_nodes - 1; i >= 0; --i) {
+             for (int i = gf->n_nodes - 1; i >= 0; --i) {
+-                if (strcmp(gf->nodes[i]->name, "result_embd_pooled") == 0) {
+-                    embd = gf->nodes[i];
 +                embd = gf->nodes[i];
 +                if (strcmp(embd->name, "result_embd_pooled") == 0) {
-+                    break;
-+                }
+                     break;
+                 }
             }
-             GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor");
-        } else {
-+         } else {
+-            GGML_ASSERT(embd != nullptr && "missing embeddings tensor");
+         } else {
             embd = nullptr; // do not extract embeddings when not needed
             GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
         }
@ -39,7 +38,6 @@ index 1fe2b9f7..a43312a7 100644
 +        if (!cparams.causal_attn) {
 +            res = nullptr; // do not extract logits when not needed
 +        }
-+
         // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
 
         ggml_backend_sched_alloc_graph(lctx.sched, gf);
--- a/llm/patches/09-lora.diff
+++ b/llm/patches/09-lora.diff
@ -1,350 +0,0 @@
-diff --git a/common/common.cpp b/common/common.cpp
-index 2e8374d5..70d0afde 100644
--- a/common/common.cpp
-+++ b/common/common.cpp
-@@ -2110,9 +2110,21 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
-         loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
-         if (loaded_la.adapter == nullptr) {
-             fprintf(stderr, "%s: error: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
-            llama_free(lctx);
-            llama_free_model(model);
-            return iparams;
-+
-+            // if that fails, try loading as ggla for compatibility
-+            int err = llama_model_apply_lora_from_file(model,
-+                                                    la.path.c_str(),
-+                                                    la.scale,
-+                                                    nullptr,
-+                                                    params.n_threads);
-+            if (err != 0) {
-+                fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
-+                llama_free(lctx);
-+                llama_free_model(model);
-+                return iparams;
-+            } else {
-+                break;
-+            }
-         }
-         iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
-     }
-diff --git a/include/llama.h b/include/llama.h
-index 93fd77ca..b0fb37a6 100644
--- a/include/llama.h
-+++ b/include/llama.h
-@@ -1160,6 +1160,20 @@ extern "C" {
- 
-     LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx);
- 
-+    // Apply a LoRA adapter to a loaded model
-+    // path_base_model is the path to a higher quality model to use as a base for
-+    // the layers modified by the adapter. Can be NULL to use the current loaded model.
-+    // The model needs to be reloaded before applying a new adapter, otherwise the adapter
-+    // will be applied on top of the previous one
-+    // Returns 0 on success
-+    LLAMA_API int32_t llama_model_apply_lora_from_file(
-+            const struct llama_model * model,
-+                            const char * path_lora,
-+                                float   scale,
-+                            const char * path_base_model,
-+                                int32_t   n_threads);
-+
-+
- #ifdef __cplusplus
- }
- #endif
-diff --git a/src/llama.cpp b/src/llama.cpp
-index 80a0dd0f..9d7b0e17 100644
--- a/src/llama.cpp
-+++ b/src/llama.cpp
-@@ -21880,3 +21880,290 @@ static void llama_log_callback_default(ggml_log_level level, const char * text,
-     fputs(text, stderr);
-     fflush(stderr);
- }
-+
-+static int llama_apply_lora_from_file_internal(
-+    const struct llama_model & model, const char * path_lora, float scale, const char * path_base_model, int n_threads
-+) {
-+    LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
-+
-+    const int64_t t_start_lora_us = ggml_time_us();
-+
-+    llama_file fin(path_lora, "rb");
-+
-+    // verify magic and version
-+    {
-+        uint32_t magic = fin.read_u32();
-+        if (magic != LLAMA_FILE_MAGIC_GGLA) {
-+            LLAMA_LOG_ERROR("%s: bad file magic\n", __func__);
-+            return 1;
-+        }
-+
-+        uint32_t format_version = fin.read_u32();
-+        if (format_version != 1) {
-+            LLAMA_LOG_ERROR("%s: unsupported file version\n", __func__ );
-+            return 1;
-+        }
-+    }
-+
-+    int32_t lora_r = fin.read_u32();
-+    int32_t lora_alpha = fin.read_u32();
-+    float scaling = scale * (float)lora_alpha / (float)lora_r;
-+
-+    LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
-+
-+    // load base model
-+    std::unique_ptr<llama_model_loader> ml;
-+    if (path_base_model) {
-+        LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
-+        ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*check_tensors*/ false, /*kv_overrides*/ nullptr));
-+        ml->init_mappings(/*prefetch*/ false); // no prefetching
-+    }
-+
-+    struct tensor_meta {
-+        std::string name;
-+        ggml_type type;
-+        int32_t ne[2];
-+        size_t offset;
-+    };
-+    std::map<std::string, tensor_meta> tensor_meta_map;
-+
-+    // load all tensor meta
-+    while (true) {
-+        if (fin.tell() == fin.size) {
-+            // eof
-+            break;
-+        }
-+
-+        int32_t n_dims;
-+        int32_t name_len;
-+        int32_t ftype;
-+
-+        fin.read_raw(&n_dims, sizeof(n_dims));
-+        fin.read_raw(&name_len, sizeof(name_len));
-+        fin.read_raw(&ftype, sizeof(ftype));
-+
-+        if (n_dims != 1 && n_dims != 2) {
-+            LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims);
-+            return 1;
-+        }
-+
-+        int32_t ne[2] = { 1, 1 };
-+        for (int i = 0; i < n_dims; ++i) {
-+            fin.read_raw(&ne[i], sizeof(ne[i]));
-+        }
-+
-+        std::string name;
-+        {
-+            GGML_ASSERT(name_len < GGML_MAX_NAME);
-+            char buf[GGML_MAX_NAME];
-+            fin.read_raw(buf, name_len);
-+            name = std::string(buf, name_len);
-+        }
-+
-+        // check for lora suffix
-+        std::string lora_suffix;
-+        if (name.length() > 6) {
-+            lora_suffix = name.substr(name.length() - 6);
-+        }
-+        if (lora_suffix != ".loraA" && lora_suffix != ".loraB") {
-+            LLAMA_LOG_ERROR("%s: error: '%s' is not a lora tensor\n", __func__, name.c_str());
-+            return 1;
-+        }
-+
-+        // tensor type
-+        ggml_type wtype;
-+        switch (ftype) {
-+            case 0: wtype = GGML_TYPE_F32;  break;
-+            case 1: wtype = GGML_TYPE_F16;  break;
-+            default:
-+                    {
-+                        LLAMA_LOG_ERROR("%s: invalid tensor data type '%d'\n",
-+                                __func__, ftype);
-+                        return 1;
-+                    }
-+        }
-+
-+        // data offset
-+        size_t offset = fin.tell();
-+        offset = (offset + 31) & -32;
-+
-+        // skip tensor data
-+        fin.seek(offset + ggml_row_size(wtype, ne[0]) * ne[1], SEEK_SET);
-+
-+        tensor_meta_map.emplace(name, tensor_meta{ name, wtype, { ne[0], ne[1] }, offset });
-+    }
-+
-+    bool warned = false;
-+    int n_tensors = 0;
-+
-+    // apply
-+    ggml_backend_t backend_cpu = ggml_backend_cpu_init();
-+    if (backend_cpu == nullptr) {
-+        LLAMA_LOG_ERROR("%s: error: failed to initialize cpu backend\n", __func__);
-+        return 1;
-+    }
-+    ggml_backend_cpu_set_n_threads(backend_cpu, n_threads);
-+
-+    std::vector<no_init<uint8_t>> read_buf;
-+    for (const auto & it : model.tensors_by_name) {
-+        const std::string & base_name = it.first;
-+        ggml_tensor * model_t = it.second;
-+
-+        if (tensor_meta_map.find(base_name + ".loraA") == tensor_meta_map.end() ||
-+            tensor_meta_map.find(base_name + ".loraB") == tensor_meta_map.end()) {
-+            continue;
-+        }
-+
-+        tensor_meta & metaA = tensor_meta_map.at(base_name + ".loraA");
-+        tensor_meta & metaB = tensor_meta_map.at(base_name + ".loraB");
-+
-+        ggml_init_params lora_init_params = {
-+            /* .mem_size   */ ggml_tensor_overhead()*128 + ggml_graph_overhead(),
-+            /* .mem_buffer */ nullptr,
-+            /* .no_alloc   */ true,
-+        };
-+        ggml_context * lora_ctx = ggml_init(lora_init_params);
-+        if (lora_ctx == nullptr) {
-+            LLAMA_LOG_ERROR("%s: error: failed to initialize lora context\n", __func__);
-+            ggml_backend_free(backend_cpu);
-+            return 1;
-+        }
-+
-+        // create tensors
-+        ggml_tensor * loraA = ggml_new_tensor_2d(lora_ctx, metaA.type, metaA.ne[0], metaA.ne[1]);
-+        ggml_tensor * loraB = ggml_new_tensor_2d(lora_ctx, metaB.type, metaB.ne[0], metaB.ne[1]);
-+        ggml_set_name(loraA, metaA.name.c_str());
-+        ggml_set_name(loraB, metaB.name.c_str());
-+
-+        ggml_tensor * base_t;
-+        if (ml) {
-+            if (!ml->get_tensor_meta(base_name.c_str())) {
-+                LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
-+                return 1;
-+            }
-+            base_t = ggml_dup_tensor(lora_ctx, ml->get_tensor_meta(base_name.c_str()));
-+        } else {
-+            base_t = ggml_dup_tensor(lora_ctx, model_t);
-+        }
-+        ggml_set_name(base_t, base_name.c_str());
-+
-+        // allocate in backend buffer
-+        ggml_backend_buffer_t lora_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type());
-+        if (lora_buf == nullptr) {
-+            LLAMA_LOG_ERROR("%s: error: failed to allocate lora tensors\n", __func__);
-+            return 1;
-+        }
-+
-+        // load tensor data
-+        auto load_tensor = [&read_buf, &fin](const tensor_meta & tensor_meta, ggml_tensor * tensor) {
-+            read_buf.resize(ggml_nbytes(tensor));
-+            fin.seek(tensor_meta.offset, SEEK_SET);
-+            fin.read_raw(read_buf.data(), ggml_nbytes(tensor));
-+            ggml_backend_tensor_set(tensor, read_buf.data(), 0, read_buf.size());
-+        };
-+        load_tensor(metaA, loraA);
-+        load_tensor(metaB, loraB);
-+
-+        // load base model tensor data
-+        if (ml) {
-+            ml->load_data_for(base_t);
-+        } else {
-+            ggml_backend_tensor_copy(model_t, base_t);
-+        }
-+
-+        if (ggml_is_quantized(base_t->type) && !warned) {
-+            LLAMA_LOG_WARN("%s: warning: using a lora adapter with a quantized model may result in poor quality, "
-+                            "use a f16 or f32 base model with --lora-base\n", __func__);
-+            warned = true;
-+        }
-+
-+        if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
-+            LLAMA_LOG_ERROR("%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
-+                            " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]);
-+            ggml_free(lora_ctx);
-+            ggml_backend_buffer_free(lora_buf);
-+            ggml_backend_free(backend_cpu);
-+            return 1;
-+        }
-+
-+        auto build_lora_graph = [&]() {
-+            // w = w + BA*s
-+            ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
-+            ggml_set_name(BA, "BA");
-+
-+            if (scaling != 1.0f) {
-+                BA = ggml_scale(lora_ctx, BA, scaling);
-+                ggml_set_name(BA, "BA_scaled");
-+            }
-+
-+            ggml_tensor * r;
-+            r = ggml_add_inplace(lora_ctx, base_t, BA);
-+            ggml_set_name(r, "r_add");
-+
-+            if (base_t->type != model_t->type) {
-+                // convert the result to the model type
-+                r = ggml_cast(lora_ctx, r, model_t->type);
-+                ggml_set_name(r, "r_cast");
-+            }
-+
-+            return r;
-+        };
-+
-+        ggml_cgraph * gf = ggml_new_graph(lora_ctx);
-+        ggml_tensor * r = build_lora_graph();
-+        ggml_build_forward_expand(gf, r);
-+
-+        ggml_backend_buffer_t graph_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type());
-+        if (graph_buf == nullptr) {
-+            LLAMA_LOG_ERROR("%s: error: failed to allocate graph tensors\n", __func__);
-+            ggml_free(lora_ctx);
-+            ggml_backend_buffer_free(lora_buf);
-+            ggml_backend_free(backend_cpu);
-+            return 1;
-+        }
-+
-+        ggml_backend_graph_compute(backend_cpu, gf);
-+
-+        ggml_backend_tensor_set(model_t, r->data, 0, ggml_nbytes(r));
-+
-+#if 0
-+        // TODO: use scheduler with fallback to CPU for less copies between CPU and GPU
-+        //ggml_backend_sched_t sched = ggml_backend_sched_new(backends.data(), backends.size(), GGML_DEFAULT_GRAPH_SIZE);
-+
-+        // sched compute
-+        ggml_build_forward_expand(gf, build_graph());
-+        ggml_backend_sched_init_measure(sched, gf);
-+
-+        // create the graph again, since the previous one was destroyed by the measure
-+        ggml_graph_clear(gf);
-+        ggml_build_forward_expand(gf, build_graph());
-+        ggml_backend_sched_graph_compute(sched, gf);
-+        ggml_backend_sched_free(sched);
-+#endif
-+
-+        ggml_backend_buffer_free(lora_buf);
-+        ggml_backend_buffer_free(graph_buf);
-+        ggml_free(lora_ctx);
-+
-+        n_tensors++;
-+        if (n_tensors % 4 == 0) {
-+            LLAMA_LOG_INFO(".");
-+        }
-+    }
-+
-+    ggml_backend_free(backend_cpu);
-+
-+    const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
-+    LLAMA_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0);
-+
-+    return 0;
-+}
-+
-+int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) {
-+    try {
-+        return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads);
-+    } catch (const std::exception & err) {
-+        LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
-+        return 1;
-+    }
-+}
-\ No newline at end of file
--- a/llm/patches/11-phi3-sliding-window.diff
+++ b/llm/patches/11-phi3-sliding-window.diff
@ -1,43 +0,0 @@
-From 6eedae4cf2fcc8015dac79cb3f28f61fcabacab2 Mon Sep 17 00:00:00 2001
-From: Michael Yang <mxyng@pm.me>
-Date: Wed, 31 Jul 2024 14:57:04 -0700
-Subject: [PATCH] phi3 sliding window
-
---
- src/llama.cpp | 6 +++---
- 1 file changed, 3 insertions(+), 3 deletions(-)
-
-diff --git a/src/llama.cpp b/src/llama.cpp
-index a207451f..f2872d4e 100644
--- a/src/llama.cpp
-+++ b/src/llama.cpp
-@@ -4893,7 +4893,7 @@ static void llm_load_hparams(
-             } break;
-         case LLM_ARCH_PHI3:
-             {
-                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
-+                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
-                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- 
-                 switch (hparams.n_layer) {
-@@ -10762,7 +10762,7 @@ struct llm_build_context {
-         struct ggml_tensor * inp_pos = build_inp_pos();
- 
-         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa();
-+        struct ggml_tensor * KQ_mask = hparams.n_swa > 0 ? build_inp_KQ_mask_swa() : build_inp_KQ_mask();
- 
-         for (int il = 0; il < n_layer; ++il) {
-             auto residual = inpL;
-@@ -10820,7 +10820,7 @@ struct llm_build_context {
- 
-                 cur = llm_build_kv(ctx0, lctx, kv_self, gf,
-                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask_swa, n_tokens, kv_head, n_kv, 1.0f, cb, il);
-+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
-             }
- 
-             if (il == n_layer - 1) {
-- 
-2.45.2
-
--- a/llm/server.go
+++ b/llm/server.go
@ -98,7 +98,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		systemTotalMemory = systemMemInfo.TotalMemory
 		systemFreeMemory = systemMemInfo.FreeMemory
 		systemSwapFreeMemory = systemMemInfo.FreeSwap
-		slog.Debug("system memory", "total", format.HumanBytes2(systemTotalMemory), "free", format.HumanBytes2(systemFreeMemory), "free_swap", format.HumanBytes2(systemSwapFreeMemory))
+		slog.Info("system memory", "total", format.HumanBytes2(systemTotalMemory), "free", format.HumanBytes2(systemFreeMemory), "free_swap", format.HumanBytes2(systemSwapFreeMemory))
 	}

 	// If the user wants zero GPU layers, reset the gpu list to be CPU/system ram info
@ -258,7 +258,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		params = append(params, "--mlock")
 	}

-	if gpu.IsNUMA() {
+	if gpu.IsNUMA() && gpus[0].Library == "cpu" {
 		numaMode := "distribute"
 		if runtime.GOOS == "linux" {
 			if _, err := exec.LookPath("numactl"); err == nil {
@ -409,7 +409,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		}

 		if err = s.cmd.Start(); err != nil {
-			// Detect permission denied and augment them essage about noexec
+			// Detect permission denied and augment the message about noexec
 			if errors.Is(err, os.ErrPermission) {
 				finalErr = fmt.Errorf("unable to start server %w.  %s may have noexec set.  Set OLLAMA_TMPDIR for server to a writable executable directory", err, dir)
 				continue
--- a/scripts/build_windows.ps1
+++ b/scripts/build_windows.ps1
@ -122,8 +122,8 @@ function buildOllama() {
            /csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} ollama.exe
        if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
    }
-    New-Item -ItemType Directory -Path .\dist\windows-${script:TARGET_ARCH}\bin\ -Force
-    cp .\ollama.exe .\dist\windows-${script:TARGET_ARCH}\bin\
+    New-Item -ItemType Directory -Path .\dist\windows-${script:TARGET_ARCH}\ -Force
+    cp .\ollama.exe .\dist\windows-${script:TARGET_ARCH}\
 }

 function buildApp() {
--- a/scripts/install.sh
+++ b/scripts/install.sh
@ -38,7 +38,7 @@ IS_WSL2=false
 KERN=$(uname -r)
 case "$KERN" in
    *icrosoft*WSL2 | *icrosoft*wsl2) IS_WSL2=true;;
-    *icrosoft) error "Microsoft WSL1 is not currently supported. Please upgrade to WSL2 with 'wsl --set-version <distro> 2'" ;;
+    *icrosoft) error "Microsoft WSL1 is not currently supported. Please use WSL2 with 'wsl --set-version <distro> 2'" ;;
    *) ;;
 esac

--- a/scripts/rh_linux_deps.sh
+++ b/scripts/rh_linux_deps.sh
@ -30,7 +30,7 @@ if grep -i "centos" /etc/system-release >/dev/null; then
        dnf install -y rh-git227-git
        ln -s /opt/rh/rh-git227/root/usr/bin/git /usr/local/bin/git
    fi
-    dnf install -y devtoolset-10-gcc devtoolset-10-gcc-c++ pigz
+    dnf install -y devtoolset-10-gcc devtoolset-10-gcc-c++ pigz findutils
 elif grep -i "rocky" /etc/system-release >/dev/null; then
    # Temporary workaround until rocky 8 AppStream ships GCC 10.4 (10.3 is incompatible with NVCC)
    cat << EOF > /etc/yum.repos.d/Rocky-Vault.repo
@ -45,6 +45,7 @@ EOF
    dnf install -y git \
        gcc-toolset-10-gcc-10.2.1-8.2.el8 \
        gcc-toolset-10-gcc-c++-10.2.1-8.2.el8 \
+        findutils \
        pigz
 else
    echo "ERROR Unexpected distro"
--- a/server/images.go
+++ b/server/images.go
@ -369,13 +369,14 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio
 	parameters := make(map[string]any)

 	var layers []Layer
+	var baseLayers []*layerGGML
 	for _, c := range modelfile.Commands {
 		mediatype := fmt.Sprintf("application/vnd.ollama.image.%s", c.Name)
+		command := c.Name

-		switch c.Name {
+		switch command {
 		case "model", "adapter":
-			var baseLayers []*layerGGML
-			if name := model.ParseName(c.Args); name.IsValid() {
+			if name := model.ParseName(c.Args); name.IsValid() && command == "model" {
 				baseLayers, err = parseFromModel(ctx, name, fn)
 				if err != nil {
 					return err
@ -409,14 +410,14 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio
 				}
 				defer blob.Close()

-				baseLayers, err = parseFromFile(ctx, blob, digest, fn)
+				baseLayers, err = parseFromFile(ctx, command, baseLayers, blob, digest, fn)
 				if err != nil {
 					return err
 				}
 			} else if file, err := os.Open(realpath(modelFileDir, c.Args)); err == nil {
 				defer file.Close()

-				baseLayers, err = parseFromFile(ctx, file, "", fn)
+				baseLayers, err = parseFromFile(ctx, command, baseLayers, file, "", fn)
 				if err != nil {
 					return err
 				}
--- a/server/model.go
+++ b/server/model.go
@ -81,7 +81,7 @@ func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressRe
 	return layers, nil
 }

-func parseFromZipFile(_ context.Context, f *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) {
+func parseFromZipFile(_ context.Context, command string, baseLayers []*layerGGML, f *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) {
 	fi, err := f.Stat()
 	if err != nil {
 		return nil, err
@ -108,16 +108,38 @@ func parseFromZipFile(_ context.Context, f *os.File, digest string, fn func(api.
 	defer t.Close()
 	defer os.Remove(t.Name())

-	fn(api.ProgressResponse{Status: "converting model"})
-	if err := convert.Convert(convert.NewZipReader(r, p, 32<<20), t); err != nil {
-		return nil, err
+	var layerType string
+
+	switch command {
+	case "adapter":
+		var baseModel *llm.GGML
+		for _, l := range baseLayers {
+			if l.GGML != nil {
+				baseModel = l.GGML
+				break
+			}
+		}
+
+		if baseModel == nil {
+			return nil, fmt.Errorf("no base model specified for the adapter")
+		}
+
+		if err := convert.ConvertAdapter(convert.NewZipReader(r, p, 32<<20), t, baseModel.KV()); err != nil {
+			return nil, err
+		}
+		layerType = "application/vnd.ollama.image.adapter"
+	case "model":
+		if err := convert.ConvertModel(convert.NewZipReader(r, p, 32<<20), t); err != nil {
+			return nil, err
+		}
+		layerType = "application/vnd.ollama.image.model"
 	}

 	if _, err := t.Seek(0, io.SeekStart); err != nil {
 		return nil, err
 	}

-	layer, err := NewLayer(t, "application/vnd.ollama.image.model")
+	layer, err := NewLayer(t, layerType)
 	if err != nil {
 		return nil, err
 	}
@ -139,7 +161,7 @@ func parseFromZipFile(_ context.Context, f *os.File, digest string, fn func(api.
 	return detectChatTemplate(layers)
 }

-func parseFromFile(ctx context.Context, file *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) {
+func parseFromFile(ctx context.Context, command string, baseLayers []*layerGGML, file *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) {
 	sr := io.NewSectionReader(file, 0, 512)
 	contentType, err := detectContentType(sr)
 	if err != nil {
@ -150,7 +172,7 @@ func parseFromFile(ctx context.Context, file *os.File, digest string, fn func(ap
 	case "gguf", "ggla":
 		// noop
 	case "application/zip":
-		return parseFromZipFile(ctx, file, digest, fn)
+		return parseFromZipFile(ctx, command, baseLayers, file, digest, fn)
 	default:
 		return nil, fmt.Errorf("unsupported content type: %s", contentType)
 	}
@ -170,7 +192,7 @@ func parseFromFile(ctx context.Context, file *os.File, digest string, fn func(ap
 		}

 		mediatype := "application/vnd.ollama.image.model"
-		if ggml.Name() == "ggla" {
+		if ggml.Name() == "ggla" || ggml.KV().Kind() == "adapter" {
 			mediatype = "application/vnd.ollama.image.adapter"
 		} else if ggml.KV().Architecture() == "clip" {
 			mediatype = "application/vnd.ollama.image.projector"
--- a/server/model_test.go
+++ b/server/model_test.go
@ -139,6 +139,7 @@ The temperature in San Francisco, CA is 70°F and in Toronto, Canada is 20°C.`,

 func TestParseFromFileFromLayer(t *testing.T) {
 	tempModels := t.TempDir()
+	t.Setenv("OLLAMA_MODELS", tempModels)

 	file, err := os.CreateTemp(tempModels, "")
 	if err != nil {
@ -153,7 +154,7 @@ func TestParseFromFileFromLayer(t *testing.T) {
 		t.Fatalf("failed to seek to start: %v", err)
 	}

-	layers, err := parseFromFile(context.Background(), file, "", func(api.ProgressResponse) {})
+	layers, err := parseFromFile(context.Background(), "model", []*layerGGML{}, file, "", func(api.ProgressResponse) {})
 	if err != nil {
 		t.Fatalf("failed to parse from file: %v", err)
 	}
@ -166,7 +167,7 @@ func TestParseFromFileFromLayer(t *testing.T) {
 		t.Fatalf("failed to seek to start: %v", err)
 	}

-	layers2, err := parseFromFile(context.Background(), file, layers[0].Digest, func(api.ProgressResponse) {})
+	layers2, err := parseFromFile(context.Background(), "model", []*layerGGML{}, file, layers[0].Digest, func(api.ProgressResponse) {})
 	if err != nil {
 		t.Fatalf("failed to parse from file: %v", err)
 	}
@ -189,6 +190,7 @@ func TestParseFromFileFromLayer(t *testing.T) {

 func TestParseLayerFromCopy(t *testing.T) {
 	tempModels := t.TempDir()
+	t.Setenv("OLLAMA_MODELS", tempModels)

 	file2, err := os.CreateTemp(tempModels, "")
 	if err != nil {
@ -206,7 +208,7 @@ func TestParseLayerFromCopy(t *testing.T) {
 		t.Fatalf("failed to seek to start: %v", err)
 	}

-	layers, err := parseFromFile(context.Background(), file2, "", func(api.ProgressResponse) {})
+	layers, err := parseFromFile(context.Background(), "model", []*layerGGML{}, file2, "", func(api.ProgressResponse) {})
 	if err != nil {
 		t.Fatalf("failed to parse from file: %v", err)
 	}
--- a/server/modelpath.go
+++ b/server/modelpath.go
@ -73,18 +73,6 @@ func ParseModelPath(name string) ModelPath {

 var errModelPathInvalid = errors.New("invalid model path")

-func (mp ModelPath) Validate() error {
-	if mp.Repository == "" {
-		return fmt.Errorf("%w: model repository name is required", errModelPathInvalid)
-	}
-
-	if strings.Contains(mp.Tag, ":") {
-		return fmt.Errorf("%w: ':' (colon) is not allowed in tag names", errModelPathInvalid)
-	}
-
-	return nil
-}
-
 func (mp ModelPath) GetNamespaceRepository() string {
 	return fmt.Sprintf("%s/%s", mp.Namespace, mp.Repository)
 }
@ -105,7 +93,11 @@ func (mp ModelPath) GetShortTagname() string {

 // GetManifestPath returns the path to the manifest file for the given model path, it is up to the caller to create the directory if it does not exist.
 func (mp ModelPath) GetManifestPath() (string, error) {
-	return filepath.Join(envconfig.Models(), "manifests", mp.Registry, mp.Namespace, mp.Repository, mp.Tag), nil
+	if p := filepath.Join(mp.Registry, mp.Namespace, mp.Repository, mp.Tag); filepath.IsLocal(p) {
+		return filepath.Join(envconfig.Models(), "manifests", p), nil
+	}
+
+	return "", errModelPathInvalid
 }

 func (mp ModelPath) BaseURL() *url.URL {
--- a/server/modelpath_test.go
+++ b/server/modelpath_test.go
@ -1,6 +1,7 @@
 package server

 import (
+	"errors"
 	"os"
 	"path/filepath"
 	"testing"
@ -154,3 +155,10 @@ func TestParseModelPath(t *testing.T) {
 		})
 	}
 }
+
+func TestInsecureModelpath(t *testing.T) {
+	mp := ParseModelPath("../../..:something")
+	if _, err := mp.GetManifestPath(); !errors.Is(err, errModelPathInvalid) {
+		t.Errorf("expected error: %v", err)
+	}
+}
--- a/server/routes.go
+++ b/server/routes.go
@ -463,7 +463,7 @@ func (s *Server) EmbeddingsHandler(c *gin.Context) {
 	c.JSON(http.StatusOK, resp)
 }

-func (s *Server) PullModelHandler(c *gin.Context) {
+func (s *Server) PullHandler(c *gin.Context) {
 	var req api.PullRequest
 	err := c.ShouldBindJSON(&req)
 	switch {
@ -513,7 +513,7 @@ func (s *Server) PullModelHandler(c *gin.Context) {
 	streamResponse(c, ch)
 }

-func (s *Server) PushModelHandler(c *gin.Context) {
+func (s *Server) PushHandler(c *gin.Context) {
 	var req api.PushRequest
 	err := c.ShouldBindJSON(&req)
 	switch {
@ -577,7 +577,7 @@ func checkNameExists(name model.Name) error {
 	return nil
 }

-func (s *Server) CreateModelHandler(c *gin.Context) {
+func (s *Server) CreateHandler(c *gin.Context) {
 	var r api.CreateRequest
 	if err := c.ShouldBindJSON(&r); errors.Is(err, io.EOF) {
 		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "missing request body"})
@ -647,7 +647,7 @@ func (s *Server) CreateModelHandler(c *gin.Context) {
 	streamResponse(c, ch)
 }

-func (s *Server) DeleteModelHandler(c *gin.Context) {
+func (s *Server) DeleteHandler(c *gin.Context) {
 	var r api.DeleteRequest
 	if err := c.ShouldBindJSON(&r); errors.Is(err, io.EOF) {
 		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "missing request body"})
@ -680,7 +680,7 @@ func (s *Server) DeleteModelHandler(c *gin.Context) {
 	}
 }

-func (s *Server) ShowModelHandler(c *gin.Context) {
+func (s *Server) ShowHandler(c *gin.Context) {
 	var req api.ShowRequest
 	err := c.ShouldBindJSON(&req)
 	switch {
@ -829,7 +829,7 @@ func getKVData(digest string, verbose bool) (llm.KV, error) {
 	return kv, nil
 }

-func (s *Server) ListModelsHandler(c *gin.Context) {
+func (s *Server) ListHandler(c *gin.Context) {
 	ms, err := Manifests()
 	if err != nil {
 		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
@ -879,7 +879,7 @@ func (s *Server) ListModelsHandler(c *gin.Context) {
 	c.JSON(http.StatusOK, api.ListResponse{Models: models})
 }

-func (s *Server) CopyModelHandler(c *gin.Context) {
+func (s *Server) CopyHandler(c *gin.Context) {
 	var r api.CopyRequest
 	if err := c.ShouldBindJSON(&r); errors.Is(err, io.EOF) {
 		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "missing request body"})
@ -1081,33 +1081,33 @@ func (s *Server) GenerateRoutes() http.Handler {
 		allowedHostsMiddleware(s.addr),
 	)

-	r.POST("/api/pull", s.PullModelHandler)
+	r.POST("/api/pull", s.PullHandler)
 	r.POST("/api/generate", s.GenerateHandler)
 	r.POST("/api/chat", s.ChatHandler)
 	r.POST("/api/embed", s.EmbedHandler)
 	r.POST("/api/embeddings", s.EmbeddingsHandler)
-	r.POST("/api/create", s.CreateModelHandler)
-	r.POST("/api/push", s.PushModelHandler)
-	r.POST("/api/copy", s.CopyModelHandler)
-	r.DELETE("/api/delete", s.DeleteModelHandler)
-	r.POST("/api/show", s.ShowModelHandler)
+	r.POST("/api/create", s.CreateHandler)
+	r.POST("/api/push", s.PushHandler)
+	r.POST("/api/copy", s.CopyHandler)
+	r.DELETE("/api/delete", s.DeleteHandler)
+	r.POST("/api/show", s.ShowHandler)
 	r.POST("/api/blobs/:digest", s.CreateBlobHandler)
 	r.HEAD("/api/blobs/:digest", s.HeadBlobHandler)
-	r.GET("/api/ps", s.ProcessHandler)
+	r.GET("/api/ps", s.PsHandler)

 	// Compatibility endpoints
 	r.POST("/v1/chat/completions", openai.ChatMiddleware(), s.ChatHandler)
 	r.POST("/v1/completions", openai.CompletionsMiddleware(), s.GenerateHandler)
 	r.POST("/v1/embeddings", openai.EmbeddingsMiddleware(), s.EmbedHandler)
-	r.GET("/v1/models", openai.ListMiddleware(), s.ListModelsHandler)
-	r.GET("/v1/models/:model", openai.RetrieveMiddleware(), s.ShowModelHandler)
+	r.GET("/v1/models", openai.ListMiddleware(), s.ListHandler)
+	r.GET("/v1/models/:model", openai.RetrieveMiddleware(), s.ShowHandler)

 	for _, method := range []string{http.MethodGet, http.MethodHead} {
 		r.Handle(method, "/", func(c *gin.Context) {
 			c.String(http.StatusOK, "Ollama is running")
 		})

-		r.Handle(method, "/api/tags", s.ListModelsHandler)
+		r.Handle(method, "/api/tags", s.ListHandler)
 		r.Handle(method, "/api/version", func(c *gin.Context) {
 			c.JSON(http.StatusOK, gin.H{"version": version.Version})
 		})
@ -1269,7 +1269,7 @@ func streamResponse(c *gin.Context, ch chan any) {
 	})
 }

-func (s *Server) ProcessHandler(c *gin.Context) {
+func (s *Server) PsHandler(c *gin.Context) {
 	models := []api.ProcessModelResponse{}

 	for _, v := range s.sched.loaded {
--- a/server/routes_create_test.go
+++ b/server/routes_create_test.go
@ -93,7 +93,7 @@ func TestCreateFromBin(t *testing.T) {
 	t.Setenv("OLLAMA_MODELS", p)

 	var s Server
-	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
+	w := createRequest(t, s.CreateHandler, api.CreateRequest{
 		Name:      "test",
 		Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, nil, nil)),
 		Stream:    &stream,
@ -120,7 +120,7 @@ func TestCreateFromModel(t *testing.T) {
 	t.Setenv("OLLAMA_MODELS", p)
 	var s Server

-	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
+	w := createRequest(t, s.CreateHandler, api.CreateRequest{
 		Name:      "test",
 		Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, nil, nil)),
 		Stream:    &stream,
@ -134,7 +134,7 @@ func TestCreateFromModel(t *testing.T) {
 		filepath.Join(p, "manifests", "registry.ollama.ai", "library", "test", "latest"),
 	})

-	w = createRequest(t, s.CreateModelHandler, api.CreateRequest{
+	w = createRequest(t, s.CreateHandler, api.CreateRequest{
 		Name:      "test2",
 		Modelfile: "FROM test",
 		Stream:    &stream,
@ -162,7 +162,7 @@ func TestCreateRemovesLayers(t *testing.T) {
 	t.Setenv("OLLAMA_MODELS", p)
 	var s Server

-	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
+	w := createRequest(t, s.CreateHandler, api.CreateRequest{
 		Name:      "test",
 		Modelfile: fmt.Sprintf("FROM %s\nTEMPLATE {{ .Prompt }}", createBinFile(t, nil, nil)),
 		Stream:    &stream,
@ -182,7 +182,7 @@ func TestCreateRemovesLayers(t *testing.T) {
 		filepath.Join(p, "blobs", "sha256-bc80b03733773e0728011b2f4adf34c458b400e1aad48cb28d61170f3a2ad2d6"),
 	})

-	w = createRequest(t, s.CreateModelHandler, api.CreateRequest{
+	w = createRequest(t, s.CreateHandler, api.CreateRequest{
 		Name:      "test",
 		Modelfile: fmt.Sprintf("FROM %s\nTEMPLATE {{ .System }} {{ .Prompt }}", createBinFile(t, nil, nil)),
 		Stream:    &stream,
@ -210,7 +210,7 @@ func TestCreateUnsetsSystem(t *testing.T) {
 	t.Setenv("OLLAMA_MODELS", p)
 	var s Server

-	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
+	w := createRequest(t, s.CreateHandler, api.CreateRequest{
 		Name:      "test",
 		Modelfile: fmt.Sprintf("FROM %s\nSYSTEM Say hi!", createBinFile(t, nil, nil)),
 		Stream:    &stream,
@ -230,7 +230,7 @@ func TestCreateUnsetsSystem(t *testing.T) {
 		filepath.Join(p, "blobs", "sha256-f29e82a8284dbdf5910b1555580ff60b04238b8da9d5e51159ada67a4d0d5851"),
 	})

-	w = createRequest(t, s.CreateModelHandler, api.CreateRequest{
+	w = createRequest(t, s.CreateHandler, api.CreateRequest{
 		Name:      "test",
 		Modelfile: fmt.Sprintf("FROM %s\nSYSTEM \"\"", createBinFile(t, nil, nil)),
 		Stream:    &stream,
@ -267,7 +267,7 @@ func TestCreateMergeParameters(t *testing.T) {
 	t.Setenv("OLLAMA_MODELS", p)
 	var s Server

-	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
+	w := createRequest(t, s.CreateHandler, api.CreateRequest{
 		Name:      "test",
 		Modelfile: fmt.Sprintf("FROM %s\nPARAMETER temperature 1\nPARAMETER top_k 10\nPARAMETER stop USER:\nPARAMETER stop ASSISTANT:", createBinFile(t, nil, nil)),
 		Stream:    &stream,
@ -288,7 +288,7 @@ func TestCreateMergeParameters(t *testing.T) {
 	})

 	// in order to merge parameters, the second model must be created FROM the first
-	w = createRequest(t, s.CreateModelHandler, api.CreateRequest{
+	w = createRequest(t, s.CreateHandler, api.CreateRequest{
 		Name:      "test2",
 		Modelfile: "FROM test\nPARAMETER temperature 0.6\nPARAMETER top_p 0.7",
 		Stream:    &stream,
@ -326,7 +326,7 @@ func TestCreateMergeParameters(t *testing.T) {
 	}

 	// slices are replaced
-	w = createRequest(t, s.CreateModelHandler, api.CreateRequest{
+	w = createRequest(t, s.CreateHandler, api.CreateRequest{
 		Name:      "test2",
 		Modelfile: "FROM test\nPARAMETER temperature 0.6\nPARAMETER top_p 0.7\nPARAMETER stop <|endoftext|>",
 		Stream:    &stream,
@ -371,7 +371,7 @@ func TestCreateReplacesMessages(t *testing.T) {
 	t.Setenv("OLLAMA_MODELS", p)
 	var s Server

-	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
+	w := createRequest(t, s.CreateHandler, api.CreateRequest{
 		Name:      "test",
 		Modelfile: fmt.Sprintf("FROM %s\nMESSAGE assistant \"What is my purpose?\"\nMESSAGE user \"You run tests.\"\nMESSAGE assistant \"Oh, my god.\"", createBinFile(t, nil, nil)),
 		Stream:    &stream,
@ -391,7 +391,7 @@ func TestCreateReplacesMessages(t *testing.T) {
 		filepath.Join(p, "blobs", "sha256-e0e27d47045063ccb167ae852c51d49a98eab33fabaee4633fdddf97213e40b5"),
 	})

-	w = createRequest(t, s.CreateModelHandler, api.CreateRequest{
+	w = createRequest(t, s.CreateHandler, api.CreateRequest{
 		Name:      "test2",
 		Modelfile: "FROM test\nMESSAGE assistant \"You're a test, Harry.\"\nMESSAGE user \"I-I'm a what?\"\nMESSAGE assistant \"A test. And a thumping good one at that, I'd wager.\"",
 		Stream:    &stream,
@ -448,7 +448,7 @@ func TestCreateTemplateSystem(t *testing.T) {
 	t.Setenv("OLLAMA_MODELS", p)
 	var s Server

-	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
+	w := createRequest(t, s.CreateHandler, api.CreateRequest{
 		Name:      "test",
 		Modelfile: fmt.Sprintf("FROM %s\nTEMPLATE {{ .Prompt }}\nSYSTEM Say hello!\nTEMPLATE {{ .System }} {{ .Prompt }}\nSYSTEM Say bye!", createBinFile(t, nil, nil)),
 		Stream:    &stream,
@ -488,7 +488,7 @@ func TestCreateTemplateSystem(t *testing.T) {
 	}

 	t.Run("incomplete template", func(t *testing.T) {
-		w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
+		w := createRequest(t, s.CreateHandler, api.CreateRequest{
 			Name:      "test",
 			Modelfile: fmt.Sprintf("FROM %s\nTEMPLATE {{ .Prompt", createBinFile(t, nil, nil)),
 			Stream:    &stream,
@ -500,7 +500,7 @@ func TestCreateTemplateSystem(t *testing.T) {
 	})

 	t.Run("template with unclosed if", func(t *testing.T) {
-		w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
+		w := createRequest(t, s.CreateHandler, api.CreateRequest{
 			Name:      "test",
 			Modelfile: fmt.Sprintf("FROM %s\nTEMPLATE {{ if .Prompt }}", createBinFile(t, nil, nil)),
 			Stream:    &stream,
@ -512,7 +512,7 @@ func TestCreateTemplateSystem(t *testing.T) {
 	})

 	t.Run("template with undefined function", func(t *testing.T) {
-		w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
+		w := createRequest(t, s.CreateHandler, api.CreateRequest{
 			Name:      "test",
 			Modelfile: fmt.Sprintf("FROM %s\nTEMPLATE {{  Prompt }}", createBinFile(t, nil, nil)),
 			Stream:    &stream,
@ -531,7 +531,7 @@ func TestCreateLicenses(t *testing.T) {
 	t.Setenv("OLLAMA_MODELS", p)
 	var s Server

-	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
+	w := createRequest(t, s.CreateHandler, api.CreateRequest{
 		Name:      "test",
 		Modelfile: fmt.Sprintf("FROM %s\nLICENSE MIT\nLICENSE Apache-2.0", createBinFile(t, nil, nil)),
 		Stream:    &stream,
@ -579,7 +579,7 @@ func TestCreateDetectTemplate(t *testing.T) {
 	var s Server

 	t.Run("matched", func(t *testing.T) {
-		w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
+		w := createRequest(t, s.CreateHandler, api.CreateRequest{
 			Name: "test",
 			Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, llm.KV{
 				"tokenizer.chat_template": "{{ bos_token }}{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}",
@ -593,14 +593,14 @@ func TestCreateDetectTemplate(t *testing.T) {

 		checkFileExists(t, filepath.Join(p, "blobs", "*"), []string{
 			filepath.Join(p, "blobs", "sha256-0d79f567714c62c048378f2107fb332dabee0135d080c302d884317da9433cc5"),
+			filepath.Join(p, "blobs", "sha256-35360843d0c84fb1506952a131bbef13cd2bb4a541251f22535170c05b56e672"),
 			filepath.Join(p, "blobs", "sha256-553c4a3f747b3d22a4946875f1cc8ed011c2930d83f864a0c7265f9ec0a20413"),
-			filepath.Join(p, "blobs", "sha256-c608dc615584cd20d9d830363dabf8a4783ae5d34245c3d8c115edb3bc7b28e4"),
-			filepath.Join(p, "blobs", "sha256-ea34c57ba5b78b740aafe2aeb74dc6507fc3ad14170b64c26a04fb9e36c88d75"),
+			filepath.Join(p, "blobs", "sha256-de3959f841e9ef6b4b6255fa41cb9e0a45da89c3066aa72bdd07a4747f848990"),
 		})
 	})

 	t.Run("unmatched", func(t *testing.T) {
-		w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
+		w := createRequest(t, s.CreateHandler, api.CreateRequest{
 			Name:      "test",
 			Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, nil, nil)),
 			Stream:    &stream,
--- a/server/routes_delete_test.go
+++ b/server/routes_delete_test.go
@ -22,7 +22,7 @@ func TestDelete(t *testing.T) {

 	var s Server

-	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
+	w := createRequest(t, s.CreateHandler, api.CreateRequest{
 		Name:      "test",
 		Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, nil, nil)),
 	})
@ -31,7 +31,7 @@ func TestDelete(t *testing.T) {
 		t.Fatalf("expected status code 200, actual %d", w.Code)
 	}

-	w = createRequest(t, s.CreateModelHandler, api.CreateRequest{
+	w = createRequest(t, s.CreateHandler, api.CreateRequest{
 		Name:      "test2",
 		Modelfile: fmt.Sprintf("FROM %s\nTEMPLATE {{ .System }} {{ .Prompt }}", createBinFile(t, nil, nil)),
 	})
@ -52,7 +52,7 @@ func TestDelete(t *testing.T) {
 		filepath.Join(p, "blobs", "sha256-fe7ac77b725cda2ccad03f88a880ecdfd7a33192d6cae08fce2c0ee1455991ed"),
 	})

-	w = createRequest(t, s.DeleteModelHandler, api.DeleteRequest{Name: "test"})
+	w = createRequest(t, s.DeleteHandler, api.DeleteRequest{Name: "test"})

 	if w.Code != http.StatusOK {
 		t.Fatalf("expected status code 200, actual %d", w.Code)
@ -68,7 +68,7 @@ func TestDelete(t *testing.T) {
 		filepath.Join(p, "blobs", "sha256-fe7ac77b725cda2ccad03f88a880ecdfd7a33192d6cae08fce2c0ee1455991ed"),
 	})

-	w = createRequest(t, s.DeleteModelHandler, api.DeleteRequest{Name: "test2"})
+	w = createRequest(t, s.DeleteHandler, api.DeleteRequest{Name: "test2"})

 	if w.Code != http.StatusOK {
 		t.Fatalf("expected status code 200, actual %d", w.Code)
@ -102,7 +102,7 @@ func TestDeleteDuplicateLayers(t *testing.T) {
 		t.Fatal(err)
 	}

-	w := createRequest(t, s.DeleteModelHandler, api.DeleteRequest{Name: "test"})
+	w := createRequest(t, s.DeleteHandler, api.DeleteRequest{Name: "test"})
 	if w.Code != http.StatusOK {
 		t.Errorf("expected status code 200, actual %d", w.Code)
 	}
--- a/server/routes_generate_test.go
+++ b/server/routes_generate_test.go
@ -84,7 +84,7 @@ func TestGenerateChat(t *testing.T) {

 	go s.sched.Run(context.TODO())

-	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
+	w := createRequest(t, s.CreateHandler, api.CreateRequest{
 		Model: "test",
 		Modelfile: fmt.Sprintf(`FROM %s
 		TEMPLATE """
@ -144,7 +144,7 @@ func TestGenerateChat(t *testing.T) {
 	})

 	t.Run("missing capabilities chat", func(t *testing.T) {
-		w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
+		w := createRequest(t, s.CreateHandler, api.CreateRequest{
 			Model: "bert",
 			Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, llm.KV{
 				"general.architecture": "bert",
@ -270,7 +270,7 @@ func TestGenerateChat(t *testing.T) {
 		checkChatResponse(t, w.Body, "test", "Hi!")
 	})

-	w = createRequest(t, s.CreateModelHandler, api.CreateRequest{
+	w = createRequest(t, s.CreateHandler, api.CreateRequest{
 		Model:     "test-system",
 		Modelfile: "FROM test\nSYSTEM You are a helpful assistant.",
 	})
@ -382,7 +382,7 @@ func TestGenerate(t *testing.T) {

 	go s.sched.Run(context.TODO())

-	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
+	w := createRequest(t, s.CreateHandler, api.CreateRequest{
 		Model: "test",
 		Modelfile: fmt.Sprintf(`FROM %s
 		TEMPLATE """
@ -442,7 +442,7 @@ func TestGenerate(t *testing.T) {
 	})

 	t.Run("missing capabilities generate", func(t *testing.T) {
-		w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
+		w := createRequest(t, s.CreateHandler, api.CreateRequest{
 			Model: "bert",
 			Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, llm.KV{
 				"general.architecture": "bert",
@ -583,7 +583,7 @@ func TestGenerate(t *testing.T) {
 		checkGenerateResponse(t, w.Body, "test", "Hi!")
 	})

-	w = createRequest(t, s.CreateModelHandler, api.CreateRequest{
+	w = createRequest(t, s.CreateHandler, api.CreateRequest{
 		Model:     "test-system",
 		Modelfile: "FROM test\nSYSTEM You are a helpful assistant.",
 	})
@ -652,7 +652,7 @@ func TestGenerate(t *testing.T) {
 		checkGenerateResponse(t, w.Body, "test-system", "Abra kadabra!")
 	})

-	w = createRequest(t, s.CreateModelHandler, api.CreateRequest{
+	w = createRequest(t, s.CreateHandler, api.CreateRequest{
 		Model: "test-suffix",
 		Modelfile: `FROM test
 TEMPLATE """{{- if .Suffix }}<PRE> {{ .Prompt }} <SUF>{{ .Suffix }} <MID>
--- a/server/routes_list_test.go
+++ b/server/routes_list_test.go
@ -31,13 +31,13 @@ func TestList(t *testing.T) {

 	var s Server
 	for _, n := range expectNames {
-		createRequest(t, s.CreateModelHandler, api.CreateRequest{
+		createRequest(t, s.CreateHandler, api.CreateRequest{
 			Name:      n,
 			Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, nil, nil)),
 		})
 	}

-	w := createRequest(t, s.ListModelsHandler, nil)
+	w := createRequest(t, s.ListHandler, nil)
 	if w.Code != http.StatusOK {
 		t.Fatalf("expected status code 200, actual %d", w.Code)
 	}
--- a/server/routes_test.go
+++ b/server/routes_test.go
@ -318,7 +318,7 @@ func TestCase(t *testing.T) {
 	var s Server
 	for _, tt := range cases {
 		t.Run(tt, func(t *testing.T) {
-			w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
+			w := createRequest(t, s.CreateHandler, api.CreateRequest{
 				Name:      tt,
 				Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, nil, nil)),
 				Stream:    &stream,
@ -334,7 +334,7 @@ func TestCase(t *testing.T) {
 			}

 			t.Run("create", func(t *testing.T) {
-				w = createRequest(t, s.CreateModelHandler, api.CreateRequest{
+				w = createRequest(t, s.CreateHandler, api.CreateRequest{
 					Name:      strings.ToUpper(tt),
 					Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, nil, nil)),
 					Stream:    &stream,
@ -350,7 +350,7 @@ func TestCase(t *testing.T) {
 			})

 			t.Run("pull", func(t *testing.T) {
-				w := createRequest(t, s.PullModelHandler, api.PullRequest{
+				w := createRequest(t, s.PullHandler, api.PullRequest{
 					Name:   strings.ToUpper(tt),
 					Stream: &stream,
 				})
@ -365,7 +365,7 @@ func TestCase(t *testing.T) {
 			})

 			t.Run("copy", func(t *testing.T) {
-				w := createRequest(t, s.CopyModelHandler, api.CopyRequest{
+				w := createRequest(t, s.CopyHandler, api.CopyRequest{
 					Source:      tt,
 					Destination: strings.ToUpper(tt),
 				})
@ -387,7 +387,7 @@ func TestShow(t *testing.T) {

 	var s Server

-	createRequest(t, s.CreateModelHandler, api.CreateRequest{
+	createRequest(t, s.CreateHandler, api.CreateRequest{
 		Name: "show-model",
 		Modelfile: fmt.Sprintf(
 			"FROM %s\nFROM %s",
@ -396,7 +396,7 @@ func TestShow(t *testing.T) {
 		),
 	})

-	w := createRequest(t, s.ShowModelHandler, api.ShowRequest{
+	w := createRequest(t, s.ShowHandler, api.ShowRequest{
 		Name: "show-model",
 	})

--- a/template/alfred.gotmpl
+++ b/template/alfred.gotmpl
@ -1 +1,2 @@
-{{ if .System }}<start_system>{{ .System }}<end_message>{{ end }}{{ if .Prompt }}<start_user>{{ .Prompt }}<end_message>{{ end }}<start_assistant>{{ .Response }}<end_message>
+{{- range .Messages }}<start_{{ .Role }}>{{ .Content }}<end_message>
+{{- end }}<start_assistant>
--- a/template/alpaca.gotmpl
+++ b/template/alpaca.gotmpl
@ -1,8 +1,18 @@
-{{ if .System }}{{ .System }}
+{{- $system := "" }}
+{{- range .Messages }}
+{{- if eq .Role "system" }}
+{{- if not $system }}{{ $system = .Content }}
+{{- else }}{{ $system = printf "%s\n\n%s" $system .Content }}
+{{- end }}
+{{- else if eq .Role "user" }}
+{{- if $system }}{{ $system }}

-{{ end }}{{ if .Prompt }}### Instruction:
-{{ .Prompt }}
+{{ $system = "" }}
+{{- end }}### Instruction:
+{{ .Content }}

-{{ end }}### Response:
-{{ .Response }}
+{{ else if eq .Role "assistant" }}### Response:
+{{ .Content }}

+{{ end }}
+{{- end }}### Response:
--- a/template/chatml.gotmpl
+++ b/template/chatml.gotmpl
@ -1,6 +1,3 @@
-{{ if .System }}<|im_start|>system
-{{ .System }}<|im_end|>
-{{ end }}{{ if .Prompt }}<|im_start|>user
-{{ .Prompt }}<|im_end|>
+{{- range .Messages }}<|im_start|>{{ .Role }}
+{{ .Content }}<|im_end|>
 {{ end }}<|im_start|>assistant
-{{ .Response }}<|im_end|>
--- a/template/chatqa.gotmpl
+++ b/template/chatqa.gotmpl
@ -1,6 +1,7 @@
-{{ if .System }}System: {{ .System }}
-
-{{ end }}{{ if .Prompt }}User: {{ .Prompt }}
-
-{{ end }}Assistant: {{ .Response }}
+{{- range .Messages }}
+{{- if eq .Role "system" }}System:
+{{- else if eq .Role "user" }}User:
+{{- else if eq .Role "assistant" }}Assistant:
+{{- end }} {{ .Content }}

+{{ end }}Assistant:
--- a/template/codellama-70b-instruct.gotmpl
+++ b/template/codellama-70b-instruct.gotmpl
@ -1,10 +1,10 @@
-{{ if .System }}Source: system
-
- {{ .System }} <step> {{ end }}Source: user
-
- {{ .Prompt }} <step> Source: assistant
-{{- if not .Response }}
-Destination: user
+{{- range .Messages }}Source:
+{{- if eq .Role "system" }} system
+{{- else if eq .Role "user" }} user
+{{- else if eq .Role "assistant" }} assistant
 {{- end }}

- {{ .Response }} <step> 
+ {{ .Content }} <step> {{ end }}Source: assistant
+Destination: user
+
+ 
--- a/template/falcon-instruct.gotmpl
+++ b/template/falcon-instruct.gotmpl
@ -1,5 +1,8 @@
-{{ if .System }}System: {{ .System }}
-{{ end }}{{ if .Prompt }}User:
-{{ .Prompt }}
+{{- range .Messages }}
+{{- if eq .Role "system" }}System: {{ .Content }}
+{{ continue }}
+{{- else if eq .Role "user" }}User:
+{{- else if eq .Role "assistant" }}Falcon:
+{{- end }}
+{{ .Content }}
 {{ end }}Falcon:
-{{ .Response }}
--- a/template/gemma-instruct.gotmpl
+++ b/template/gemma-instruct.gotmpl
@ -1,5 +1,16 @@
-<start_of_turn>user
-{{ if .System }}{{ .System }}
-{{ end }}{{ .Prompt }}<end_of_turn>
-<start_of_turn>model
-{{ .Response }}<end_of_turn>
+{{- $system := "" }}
+{{- range .Messages }}
+{{- if eq .Role "system" }}
+{{- if not $system }}{{ $system = .Content }}
+{{- else }}{{ $system = printf "%s\n\n%s" $system .Content }}
+{{- end }}
+{{- continue }}
+{{- else if eq .Role "user" }}<start_of_turn>user
+{{- if $system }}
+{{ $system }}
+{{- $system = "" }}
+{{- end }}
+{{- else if eq .Role "assistant" }}<start_of_turn>model
+{{- end }}
+{{ .Content }}<end_of_turn>
+{{ end }}<start_of_turn>model
--- a/template/granite-instruct.gotmpl
+++ b/template/granite-instruct.gotmpl
@ -1,9 +1,8 @@
-{{ if .System }}System:
-{{ .System }}
-
-{{ end }}{{ if .Prompt }}Question:
-{{ .Prompt }}
+{{- range .Messages }}
+{{- if eq .Role "system" }}System:
+{{- else if eq .Role "user" }}Question:
+{{- else if eq .Role "assistant" }}Answer:
+{{- end }}
+{{ .Content }}

 {{ end }}Answer:
-{{ .Response }}
-
--- a/template/index.json
+++ b/template/index.json
@ -91,6 +91,10 @@
    "template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}",
    "name": "llama3-instruct"
  },
+  {
+    "template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n    {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n    {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n    {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n    {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n    {%- set system_message = messages[0]['content']|trim %}\n    {%- set messages = messages[1:] %}\n{%- else %}\n    {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n    {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n    {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n    {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n    {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n    {{- \"Do not use variables.\\n\\n\" }}\n    {%- for t in tools %}\n        {{- t | tojson(indent=4) }}\n        {{- \"\\n\\n\" }}\n    {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n    {#- Extract the first user message so we can plug it in here #}\n    {%- if messages | length != 0 %}\n        {%- set first_user_message = messages[0]['content']|trim %}\n        {%- set messages = messages[1:] %}\n    {%- else %}\n        {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n    {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n    {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n    {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n    {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n    {{- \"Do not use variables.\\n\\n\" }}\n    {%- for t in tools %}\n        {{- t | tojson(indent=4) }}\n        {{- \"\\n\\n\" }}\n    {%- endfor %}\n    {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n    {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n        {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n    {%- elif 'tool_calls' in message %}\n        {%- if not message.tool_calls|length == 1 %}\n            {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n        {%- endif %}\n        {%- set tool_call = message.tool_calls[0].function %}\n        {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n            {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n            {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n            {%- for arg_name, arg_val in tool_call.arguments | items %}\n                {{- arg_name + '=\"' + arg_val + '\"' }}\n                {%- if not loop.last %}\n                    {{- \", \" }}\n                {%- endif %}\n                {%- endfor %}\n            {{- \")\" }}\n        {%- else  %}\n            {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n            {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n            {{- '\"parameters\": ' }}\n            {{- tool_call.arguments | tojson }}\n            {{- \"}\" }}\n        {%- endif %}\n        {%- if builtin_tools is defined %}\n            {#- This means we're in ipython mode #}\n            {{- \"<|eom_id|>\" }}\n        {%- else %}\n            {{- \"<|eot_id|>\" }}\n        {%- endif %}\n    {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n        {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n        {%- if message.content is mapping or message.content is iterable %}\n     
       {{- message.content | tojson }}\n        {%- else %}\n            {{- message.content }}\n        {%- endif %}\n        {{- \"<|eot_id|>\" }}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n",
+    "name": "llama3-instruct"
+  },
  {
    "template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ 'Question:\n' + message['content'] + '\n\n' }}{% elif message['role'] == 'system' %}\n{{ 'System:\n' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Answer:\n'  + message['content'] + '\n\n' }}{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ 'Answer:\n' }}{% endif %}{% endfor %}",
    "name": "granite-instruct"
--- a/template/llama2-chat.gotmpl
+++ b/template/llama2-chat.gotmpl
@ -1,6 +1,14 @@
-[INST] <<SYS>>
-{{- if .System }}
-{{ .System }}
-{{ end }}<</SYS>>
+{{- $system := "" }}[INST] {{ range .Messages }}
+{{- if eq .Role "system" }}
+{{- if not $system }}{{ $system = .Content }}
+{{- else }}{{ $system = printf "%s\n\n%s" $system .Content }}
+{{- end }}
+{{- else if eq .Role "user" }}<<SYS>>
+{{- if $system }}
+{{ $system }}
+{{ $system = "" }}
+{{- end }}<</SYS>>

-{{ .Prompt }} [/INST] {{ .Response }}</s><s>
+{{ .Content }} [/INST]
+{{- else if eq .Role "assistant" }} {{ .Content }}</s><s>[INST] {{ end }}
+{{- end }}
--- a/template/llama3-instruct.gotmpl
+++ b/template/llama3-instruct.gotmpl
@ -1,7 +1,5 @@
-{{ if .System }}<|start_header_id|>system<|end_header_id|>
+{{- range .Messages }}<|start_header_id|>{{ .Role }}<|end_header_id|>

-{{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|>
+{{ .Content }}<|eot_id|>
+{{- end }}<|start_header_id|>assistant<|end_header_id|>

-{{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|>
-
-{{ .Response }}<|eot_id|>
--- a/template/magicoder.gotmpl
+++ b/template/magicoder.gotmpl
@ -1,8 +1,17 @@
-{{ if .System }}{{ .System }}
+{{- $system := "" }}
+{{- range .Messages }}
+{{- if eq .Role "system" }}
+{{- if not $system }}{{ $system = .Content }}
+{{- else }}{{ $system = printf "%s\n\n%s" $system .Content }}
+{{- end }}
+{{- continue }}
+{{- else if eq .Role "user" }}
+{{- if $system }}{{ $system }}

-{{ end }}{{ if .Prompt }}@@ Instruction
-{{ .Prompt }}
+{{ $system = "" }}
+{{- end }}@@ Instruction
+{{- else if eq .Role "assistant" }}@@ Response
+{{- end }}
+{{ .Content }}

 {{ end }}@@ Response
-{{ .Response }}
-
--- a/template/mistral-instruct.gotmpl
+++ b/template/mistral-instruct.gotmpl
@ -1,3 +1,6 @@
-[INST] {{ if .System }}{{ .System }}
+[INST] {{ range $index, $_ := .Messages }}
+{{- if eq .Role "system" }}{{ .Content }}

-{{ end }}{{ .Prompt }}[/INST] {{ .Response }}</s>
+{{ else if eq .Role "user" }}{{ .Content }}[/INST]
+{{- else if eq .Role "assistant" }} {{ .Content }}</s>[INST] {{ end }}
+{{- end }}
--- a/template/openchat.gotmpl
+++ b/template/openchat.gotmpl
@ -1 +1,6 @@
-{{ if .System }}GPT4 Correct System: {{ .System }}<|end_of_turn|>{{ end }}GPT4 Correct User: {{ .Prompt }}<|end_of_turn|>GPT4 Correct Assistant: {{ .Response }}<|end_of_turn|>
+{{- range .Messages }}GPT4 Correct
+{{- if eq .Role "system" }} System:
+{{- else if eq .Role "user" }} User:
+{{- else if eq .Role "assistant" }} Assistant:
+{{- end }} {{ .Content }}<|end_of_turn|>
+{{- end }}GPT4 Correct Assistant:
--- a/template/phi-3.gotmpl
+++ b/template/phi-3.gotmpl
@ -1,6 +1,3 @@
-{{ if .System }}<|system|>
-{{ .System }}<|end|>
-{{ end }}{{ if .Prompt }}<|user|>
-{{ .Prompt }}<|end|>
+{{- range .Messages }}<|{{ .Role }}|>
+{{ .Content }}<|end|>
 {{ end }}<|assistant|>
-{{ .Response }}<|end|>
--- a/template/solar-instruct.gotmpl
+++ b/template/solar-instruct.gotmpl
@ -1,9 +1,11 @@
-{{ if .System }}### System:
-{{ .System }}
+{{- range .Messages }}
+{{- if eq .Role "system" }}### System:
+{{- else if eq .Role "user" }}### User:
+{{- else if eq .Role "assistant" }}### Assistant:
+{{ .Content }}</s>

-{{ end }}{{ if .Prompt }}### User:
-{{ .Prompt }}
+{{ continue }}
+{{- end }}
+{{ .Content }}

 {{ end }}### Assistant:
-{{ .Response }}</s>
-
--- a/template/starcoder2-instruct.gotmpl
+++ b/template/starcoder2-instruct.gotmpl
@ -1,8 +1,18 @@
-{{ if .System }}{{ .System }}
+{{- $system := "" }}
+{{- range .Messages }}
+{{- if eq .Role "system" }}
+{{- if not $system }}{{ $system = .Content }}
+{{- else }}{{ $system = printf "%s\n\n%s" $system .Content }}
+{{- end }}
+{{- else if eq .Role "user" }}
+{{- if $system }}{{ $system }}

-{{ end }}{{ if .Prompt }}### Instruction
-{{ .Prompt }}
+{{ $system = "" }}
+{{- end }}### Instruction
+{{ .Content }}

-{{ end }}### Response
-{{ .Response }}<|endoftext|>
+{{ else if eq .Role "assistant" }}### Response
+{{ .Content }}<|endoftext|>

+{{ end }}
+{{- end }}### Response
--- a/template/vicuna.gotmpl
+++ b/template/vicuna.gotmpl
@ -1,4 +1,14 @@
-{{ if .System }}{{ .System }}
+{{- $system := "" }}
+{{- range .Messages }}
+{{- if eq .Role "system" }}
+{{- if not $system }}{{ $system = .Content }}
+{{- else }}{{ $system = printf "%s\n\n%s" $system .Content }}
+{{- end }}
+{{- else if eq .Role "user" }}
+{{- if $system }}{{ $system }}

-{{ end }}{{ if .Prompt }}USER: {{ .Prompt }}
-{{ end }}ASSISTANT: {{ .Response }}</s>
+{{ $system = "" }}
+{{- end }}USER: {{ .Content }}
+{{ else if eq .Role "assistant" }}ASSISTANT: {{ .Content }}</s>
+{{ end }}
+{{- end }}ASSISTANT:
--- a/template/zephyr.gotmpl
+++ b/template/zephyr.gotmpl
@ -1,6 +1,3 @@
-{{ if .System }}<|system|>
-{{ .System }}</s>
-{{ end }}{{ if .Prompt }}<|user|>
-{{ .Prompt }}</s>
+{{- range .Messages }}<|{{ .Role }}|>
+{{ .Content }}</s>
 {{ end }}<|assistant|>
-{{ .Response }}</s>