diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 5ae630c3..f0c6db5d 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -31,7 +31,7 @@ jobs: security set-keychain-settings -lut 3600 build.keychain - uses: actions/setup-go@v5 with: - go-version-file: go.mod + go-version: "stable" cache: true - name: Build Darwin env: @@ -87,7 +87,7 @@ jobs: write-host "plugin installed" - uses: actions/setup-go@v5 with: - go-version-file: go.mod + go-version: "stable" cache: true - run: go get ./... - run: | @@ -141,7 +141,7 @@ jobs: write-host "plugin installed" - uses: actions/setup-go@v5 with: - go-version-file: go.mod + go-version: "stable" cache: true - name: 'Install ROCm' run: | @@ -218,7 +218,7 @@ jobs: write-host "plugin installed" - uses: actions/setup-go@v5 with: - go-version-file: go.mod + go-version: "stable" cache: true - name: 'Install CUDA' run: | @@ -306,7 +306,7 @@ jobs: write-host "plugin installed" - uses: actions/setup-go@v5 with: - go-version-file: go.mod + go-version: "stable" cache: true - run: go get - uses: actions/download-artifact@v4 diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 977d8da1..5e002a22 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -63,7 +63,7 @@ jobs: - uses: actions/checkout@v4 - uses: actions/setup-go@v5 with: - go-version-file: go.mod + go-version: "stable" cache: true - run: go get ./... - run: | @@ -126,7 +126,7 @@ jobs: strategy: matrix: rocm-version: - - '6.1.1' + - '6.1.2' runs-on: linux container: rocm/dev-ubuntu-20.04:${{ matrix.rocm-version }} steps: @@ -163,7 +163,7 @@ jobs: - uses: actions/checkout@v4 - uses: actions/setup-go@v5 with: - go-version-file: go.mod + go-version: "stable" cache: true - name: 'Install ROCm' run: | @@ -200,7 +200,7 @@ jobs: - uses: actions/checkout@v4 - uses: actions/setup-go@v5 with: - go-version-file: go.mod + go-version: "stable" cache: true - name: 'Install CUDA' run: | @@ -255,7 +255,7 @@ jobs: submodules: recursive - uses: actions/setup-go@v5 with: - go-version-file: go.mod + go-version: "stable" cache: false - run: | case ${{ matrix.arch }} in @@ -297,7 +297,7 @@ jobs: submodules: recursive - uses: actions/setup-go@v5 with: - go-version-file: go.mod + go-version: "stable" cache: true - run: | case ${{ matrix.arch }} in diff --git a/Dockerfile b/Dockerfile index b2c5c4a2..c8efdd8a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,8 +1,8 @@ -ARG GOLANG_VERSION=1.22.1 +ARG GOLANG_VERSION=1.22.5 ARG CMAKE_VERSION=3.22.1 # this CUDA_VERSION corresponds with the one specified in docs/gpu.md ARG CUDA_VERSION=11.3.1 -ARG ROCM_VERSION=6.1.1 +ARG ROCM_VERSION=6.1.2 # Copy the minimal context we need to run the generate scripts FROM scratch AS llm-code diff --git a/README.md b/README.md index 62f5cd65..824b3761 100644 --- a/README.md +++ b/README.md @@ -35,10 +35,10 @@ The official [Ollama Docker image](https://hub.docker.com/r/ollama/ollama) `olla ## Quickstart -To run and chat with [Llama 3](https://ollama.com/library/llama3): +To run and chat with [Llama 3.1](https://ollama.com/library/llama3.1): ``` -ollama run llama3 +ollama run llama3.1 ``` ## Model library @@ -49,8 +49,9 @@ Here are some example models that can be downloaded: | Model | Parameters | Size | Download | | ------------------ | ---------- | ----- | ------------------------------ | -| Llama 3 | 8B | 4.7GB | `ollama run llama3` | -| Llama 3 | 70B | 40GB | `ollama run llama3:70b` | +| Llama 3.1 | 8B | 4.7GB | `ollama run llama3.1` | +| Llama 3.1 
| 70B | 40GB | `ollama run llama3.1:70b` | +| Llama 3.1 | 405B | 231GB | `ollama run llama3.1:405b` | | Phi 3 Mini | 3.8B | 2.3GB | `ollama run phi3` | | Phi 3 Medium | 14B | 7.9GB | `ollama run phi3:medium` | | Gemma 2 | 9B | 5.5GB | `ollama run gemma2` | @@ -64,7 +65,8 @@ Here are some example models that can be downloaded: | LLaVA | 7B | 4.5GB | `ollama run llava` | | Solar | 10.7B | 6.1GB | `ollama run solar` | -> Note: You should have at least 8 GB of RAM available to run the 7B models, 16 GB to run the 13B models, and 32 GB to run the 33B models. +> [!NOTE] +> You should have at least 8 GB of RAM available to run the 7B models, 16 GB to run the 13B models, and 32 GB to run the 33B models. ## Customize a model @@ -96,16 +98,16 @@ See the [guide](docs/import.md) on importing models for more information. ### Customize a prompt -Models from the Ollama library can be customized with a prompt. For example, to customize the `llama3` model: +Models from the Ollama library can be customized with a prompt. For example, to customize the `llama3.1` model: ``` -ollama pull llama3 +ollama pull llama3.1 ``` Create a `Modelfile`: ``` -FROM llama3 +FROM llama3.1 # set the temperature to 1 [higher is more creative, lower is more coherent] PARAMETER temperature 1 @@ -140,7 +142,7 @@ ollama create mymodel -f ./Modelfile ### Pull a model ``` -ollama pull llama3 +ollama pull llama3.1 ``` > This command can also be used to update a local model. Only the diff will be pulled. @@ -148,13 +150,13 @@ ollama pull llama3 ### Remove a model ``` -ollama rm llama3 +ollama rm llama3.1 ``` ### Copy a model ``` -ollama cp llama3 my-model +ollama cp llama3.1 my-model ``` ### Multiline input @@ -178,14 +180,14 @@ The image features a yellow smiley face, which is likely the central focus of th ### Pass the prompt as an argument ``` -$ ollama run llama3 "Summarize this file: $(cat README.md)" +$ ollama run llama3.1 "Summarize this file: $(cat README.md)" Ollama is a lightweight, extensible framework for building and running language models on the local machine. It provides a simple API for creating, running, and managing models, as well as a library of pre-built models that can be easily used in a variety of applications. ``` ### Show model information ``` -ollama show llama3 +ollama show llama3.1 ``` ### List models on your computer @@ -213,7 +215,7 @@ Next, start the server: Finally, in a separate shell, run a model: ``` -./ollama run llama3 +./ollama run llama3.1 ``` ## REST API @@ -224,7 +226,7 @@ Ollama has a REST API for running and managing models. ``` curl http://localhost:11434/api/generate -d '{ - "model": "llama3", + "model": "llama3.1", "prompt":"Why is the sky blue?" }' ``` @@ -233,7 +235,7 @@ curl http://localhost:11434/api/generate -d '{ ``` curl http://localhost:11434/api/chat -d '{ - "model": "llama3", + "model": "llama3.1", "messages": [ { "role": "user", "content": "why is the sky blue?" } ] @@ -293,6 +295,10 @@ See the [API documentation](./docs/api.md) for all endpoints. 
- [OllamaSpring](https://github.com/CrazyNeil/OllamaSpring) (Ollama Client for macOS) - [LLocal.in](https://github.com/kartikm7/llocal) (Easy to use Electron Desktop Client for Ollama) - [Ollama with Google Mesop](https://github.com/rapidarchitect/ollama_mesop/) (Mesop Chat Client implementation with Ollama) +- [Kerlig AI](https://www.kerlig.com/) (AI writing assistant for macOS) +- [AI Studio](https://github.com/MindWorkAI/AI-Studio) +- [Sidellama](https://github.com/gyopak/sidellama) (browser-based LLM client) +- [LLMStack](https://github.com/trypromptly/LLMStack) (No-code multi-agent framework to build LLM agents and workflows) ### Terminal @@ -384,7 +390,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [Llama Coder](https://github.com/ex3ndr/llama-coder) (Copilot alternative using Ollama) - [Ollama Copilot](https://github.com/bernardo-bruning/ollama-copilot) (Proxy that allows you to use ollama as a copilot like Github copilot) - [twinny](https://github.com/rjmacarthy/twinny) (Copilot and Copilot chat alternative using Ollama) -- [Wingman-AI](https://github.com/RussellCanfield/wingman-ai) (Copilot code and chat alternative using Ollama and HuggingFace) +- [Wingman-AI](https://github.com/RussellCanfield/wingman-ai) (Copilot code and chat alternative using Ollama and Hugging Face) - [Page Assist](https://github.com/n4ze3m/page-assist) (Chrome Extension) - [AI Telegram Bot](https://github.com/tusharhero/aitelegrambot) (Telegram bot using Ollama in backend) - [AI ST Completion](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (Sublime Text 4 AI assistant plugin with Ollama support) diff --git a/api/client.go b/api/client.go index fccbc9ad..c59fbc42 100644 --- a/api/client.go +++ b/api/client.go @@ -347,7 +347,16 @@ func (c *Client) Heartbeat(ctx context.Context) error { return nil } -// Embeddings generates embeddings from a model. +// Embed generates embeddings from a model. +func (c *Client) Embed(ctx context.Context, req *EmbedRequest) (*EmbedResponse, error) { + var resp EmbedResponse + if err := c.do(ctx, http.MethodPost, "/api/embed", req, &resp); err != nil { + return nil, err + } + return &resp, nil +} + +// Embeddings generates an embedding from a model. func (c *Client) Embeddings(ctx context.Context, req *EmbeddingRequest) (*EmbeddingResponse, error) { var resp EmbeddingResponse if err := c.do(ctx, http.MethodPost, "/api/embeddings", req, &resp); err != nil { diff --git a/api/types.go b/api/types.go index 87844c67..c2529652 100644 --- a/api/types.go +++ b/api/types.go @@ -47,6 +47,9 @@ type GenerateRequest struct { // Prompt is the textual prompt to send to the model. Prompt string `json:"prompt"` + // Suffix is the text that comes after the inserted text. + Suffix string `json:"suffix"` + // System overrides the model's default system message/prompt. System string `json:"system"` @@ -97,17 +100,85 @@ type ChatRequest struct { // followin the request. KeepAlive *Duration `json:"keep_alive,omitempty"` + // Tools is an optional list of tools the model has access to. + Tools `json:"tools,omitempty"` + // Options lists model-specific options. Options map[string]interface{} `json:"options"` } +type Tools []Tool + +func (t Tools) String() string { + bts, _ := json.Marshal(t) + return string(bts) +} + +func (t Tool) String() string { + bts, _ := json.Marshal(t) + return string(bts) +} + // Message is a single message in a chat sequence. The message contains the // role ("system", "user", or "assistant"), the content and an optional list // of images. 
type Message struct { - Role string `json:"role"` - Content string `json:"content"` - Images []ImageData `json:"images,omitempty"` + Role string `json:"role"` + Content string `json:"content"` + Images []ImageData `json:"images,omitempty"` + ToolCalls []ToolCall `json:"tool_calls,omitempty"` +} + +func (m *Message) UnmarshalJSON(b []byte) error { + type Alias Message + var a Alias + if err := json.Unmarshal(b, &a); err != nil { + return err + } + + *m = Message(a) + m.Role = strings.ToLower(m.Role) + return nil +} + +type ToolCall struct { + Function ToolCallFunction `json:"function"` +} + +type ToolCallFunction struct { + Name string `json:"name"` + Arguments ToolCallFunctionArguments `json:"arguments"` +} + +type ToolCallFunctionArguments map[string]any + +func (t *ToolCallFunctionArguments) String() string { + bts, _ := json.Marshal(t) + return string(bts) +} + +type Tool struct { + Type string `json:"type"` + Function ToolFunction `json:"function"` +} + +type ToolFunction struct { + Name string `json:"name"` + Description string `json:"description"` + Parameters struct { + Type string `json:"type"` + Required []string `json:"required"` + Properties map[string]struct { + Type string `json:"type"` + Description string `json:"description"` + Enum []string `json:"enum,omitempty"` + } `json:"properties"` + } `json:"parameters"` +} + +func (t *ToolFunction) String() string { + bts, _ := json.Marshal(t) + return string(bts) } // ChatResponse is the response returned by [Client.Chat]. Its fields are @@ -143,6 +214,7 @@ type Options struct { NumPredict int `json:"num_predict,omitempty"` TopK int `json:"top_k,omitempty"` TopP float32 `json:"top_p,omitempty"` + MinP float32 `json:"min_p,omitempty"` TFSZ float32 `json:"tfs_z,omitempty"` TypicalP float32 `json:"typical_p,omitempty"` RepeatLastN int `json:"repeat_last_n,omitempty"` @@ -173,6 +245,34 @@ type Runner struct { NumThread int `json:"num_thread,omitempty"` } +// EmbedRequest is the request passed to [Client.Embed]. +type EmbedRequest struct { + // Model is the model name. + Model string `json:"model"` + + // Input is the input to embed. + Input any `json:"input"` + + // KeepAlive controls how long the model will stay loaded in memory following + // this request. + KeepAlive *Duration `json:"keep_alive,omitempty"` + + Truncate *bool `json:"truncate,omitempty"` + + // Options lists model-specific options. + Options map[string]interface{} `json:"options"` +} + +// EmbedResponse is the response from [Client.Embed]. +type EmbedResponse struct { + Model string `json:"model"` + Embeddings [][]float32 `json:"embeddings"` + + TotalDuration time.Duration `json:"total_duration,omitempty"` + LoadDuration time.Duration `json:"load_duration,omitempty"` + PromptEvalCount int `json:"prompt_eval_count,omitempty"` +} + // EmbeddingRequest is the request passed to [Client.Embeddings]. type EmbeddingRequest struct { // Model is the model name. @@ -219,8 +319,10 @@ type DeleteRequest struct { // ShowRequest is the request passed to [Client.Show]. 
type ShowRequest struct { - Model string `json:"model"` - System string `json:"system"` + Model string `json:"model"` + System string `json:"system"` + + // Template is deprecated Template string `json:"template"` Verbose bool `json:"verbose"` diff --git a/api/types_test.go b/api/types_test.go index c60ed90e..4699c150 100644 --- a/api/types_test.go +++ b/api/types_test.go @@ -208,3 +208,26 @@ func TestUseMmapFormatParams(t *testing.T) { }) } } + +func TestMessage_UnmarshalJSON(t *testing.T) { + tests := []struct { + input string + expected string + }{ + {`{"role": "USER", "content": "Hello!"}`, "user"}, + {`{"role": "System", "content": "Initialization complete."}`, "system"}, + {`{"role": "assistant", "content": "How can I help you?"}`, "assistant"}, + {`{"role": "TOOl", "content": "Access granted."}`, "tool"}, + } + + for _, test := range tests { + var msg Message + if err := json.Unmarshal([]byte(test.input), &msg); err != nil { + t.Errorf("Unexpected error: %v", err) + } + + if msg.Role != test.expected { + t.Errorf("role not lowercased: got %v, expected %v", msg.Role, test.expected) + } + } +} diff --git a/app/ollama.iss b/app/ollama.iss index 6bedb9ff..dc6178f7 100644 --- a/app/ollama.iss +++ b/app/ollama.iss @@ -138,7 +138,7 @@ SetupAppRunningError=Another Ollama installer is running.%n%nPlease cancel or fi ;FinishedHeadingLabel=Run your first model -;FinishedLabel=%nRun this command in a PowerShell or cmd terminal.%n%n%n ollama run llama3 +;FinishedLabel=%nRun this command in a PowerShell or cmd terminal.%n%n%n ollama run llama3.1 ;ClickFinish=%n [Registry] diff --git a/app/ollama_welcome.ps1 b/app/ollama_welcome.ps1 index 9af37a46..46777a3a 100644 --- a/app/ollama_welcome.ps1 +++ b/app/ollama_welcome.ps1 @@ -4,5 +4,5 @@ write-host "Welcome to Ollama!" 
write-host "" write-host "Run your first model:" write-host "" -write-host "`tollama run llama3" +write-host "`tollama run llama3.1" write-host "" \ No newline at end of file diff --git a/cmd/cmd.go b/cmd/cmd.go index c898c7db..610fddcb 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -843,7 +843,6 @@ type runOptions struct { WordWrap bool Format string System string - Template string Images []api.ImageData Options map[string]interface{} MultiModal bool @@ -1037,7 +1036,6 @@ func generate(cmd *cobra.Command, opts runOptions) error { Images: opts.Images, Format: opts.Format, System: opts.System, - Template: opts.Template, Options: opts.Options, KeepAlive: opts.KeepAlive, } @@ -1343,10 +1341,10 @@ func NewCLI() *cobra.Command { envVars["OLLAMA_NUM_PARALLEL"], envVars["OLLAMA_NOPRUNE"], envVars["OLLAMA_ORIGINS"], + envVars["OLLAMA_SCHED_SPREAD"], envVars["OLLAMA_TMPDIR"], envVars["OLLAMA_FLASH_ATTENTION"], envVars["OLLAMA_LLM_LIBRARY"], - envVars["OLLAMA_MAX_VRAM"], }) default: appendEnvDocs(cmd, envs) diff --git a/cmd/interactive.go b/cmd/interactive.go index 9214f2db..70afc6ea 100644 --- a/cmd/interactive.go +++ b/cmd/interactive.go @@ -1,6 +1,7 @@ package cmd import ( + "cmp" "errors" "fmt" "io" @@ -9,13 +10,14 @@ import ( "path/filepath" "regexp" "slices" - "sort" "strings" "github.com/spf13/cobra" + "golang.org/x/exp/maps" "github.com/ollama/ollama/api" "github.com/ollama/ollama/envconfig" + "github.com/ollama/ollama/parser" "github.com/ollama/ollama/progress" "github.com/ollama/ollama/readline" "github.com/ollama/ollama/types/errtypes" @@ -27,7 +29,6 @@ const ( MultilineNone MultilineState = iota MultilinePrompt MultilineSystem - MultilineTemplate ) func loadModel(cmd *cobra.Command, opts *runOptions) error { @@ -94,7 +95,6 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error { fmt.Fprintln(os.Stderr, "Available Commands:") fmt.Fprintln(os.Stderr, " /set parameter ... 
Set a parameter") fmt.Fprintln(os.Stderr, " /set system Set system message") - fmt.Fprintln(os.Stderr, " /set template Set prompt template") fmt.Fprintln(os.Stderr, " /set history Enable history") fmt.Fprintln(os.Stderr, " /set nohistory Disable history") fmt.Fprintln(os.Stderr, " /set wordwrap Enable wordwrap") @@ -140,6 +140,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error { fmt.Fprintln(os.Stderr, " /set parameter num_predict Max number of tokens to predict") fmt.Fprintln(os.Stderr, " /set parameter top_k Pick from top k num of tokens") fmt.Fprintln(os.Stderr, " /set parameter top_p Pick token based on sum of probabilities") + fmt.Fprintln(os.Stderr, " /set parameter min_p Pick token based on top token probability * min_p") fmt.Fprintln(os.Stderr, " /set parameter num_ctx Set the context size") fmt.Fprintln(os.Stderr, " /set parameter temperature Set creativity level") fmt.Fprintln(os.Stderr, " /set parameter repeat_penalty How strongly to penalize repetitions") @@ -204,10 +205,6 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error { opts.Messages = append(opts.Messages, api.Message{Role: "system", Content: opts.System}) fmt.Println("Set system message.") sb.Reset() - case MultilineTemplate: - opts.Template = sb.String() - fmt.Println("Set prompt template.") - sb.Reset() } multiline = MultilineNone @@ -326,17 +323,13 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error { } fmt.Printf("Set parameter '%s' to '%s'\n", args[2], strings.Join(params, ", ")) opts.Options[args[2]] = fp[args[2]] - case "system", "template": + case "system": if len(args) < 3 { usageSet() continue } - if args[1] == "system" { - multiline = MultilineSystem - } else if args[1] == "template" { - multiline = MultilineTemplate - } + multiline = MultilineSystem line := strings.Join(args[2:], " ") line, ok := strings.CutPrefix(line, `"""`) @@ -356,23 +349,17 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error { continue } - if args[1] == "system" { - opts.System = sb.String() // for display in modelfile - newMessage := api.Message{Role: "system", Content: sb.String()} - // Check if the slice is not empty and the last message is from 'system' - if len(opts.Messages) > 0 && opts.Messages[len(opts.Messages)-1].Role == "system" { - // Replace the last message - opts.Messages[len(opts.Messages)-1] = newMessage - } else { - opts.Messages = append(opts.Messages, newMessage) - } - fmt.Println("Set system message.") - sb.Reset() - } else if args[1] == "template" { - opts.Template = sb.String() - fmt.Println("Set prompt template.") - sb.Reset() + opts.System = sb.String() // for display in modelfile + newMessage := api.Message{Role: "system", Content: sb.String()} + // Check if the slice is not empty and the last message is from 'system' + if len(opts.Messages) > 0 && opts.Messages[len(opts.Messages)-1].Role == "system" { + // Replace the last message + opts.Messages[len(opts.Messages)-1] = newMessage + } else { + opts.Messages = append(opts.Messages, newMessage) } + fmt.Println("Set system message.") + sb.Reset() sb.Reset() continue @@ -391,10 +378,9 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error { return err } req := &api.ShowRequest{ - Name: opts.Model, - System: opts.System, - Template: opts.Template, - Options: opts.Options, + Name: opts.Model, + System: opts.System, + Options: opts.Options, } resp, err := client.Show(cmd.Context(), req) if err != nil { @@ -437,12 +423,9 @@ func generateInteractive(cmd *cobra.Command, opts 
runOptions) error { fmt.Println("No system message was specified for this model.") } case "template": - switch { - case opts.Template != "": - fmt.Println(opts.Template + "\n") - case resp.Template != "": + if resp.Template != "" { fmt.Println(resp.Template) - default: + } else { fmt.Println("No prompt template was specified for this model.") } default: @@ -526,35 +509,35 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error { } func buildModelfile(opts runOptions) string { - var mf strings.Builder - model := opts.ParentModel - if model == "" { - model = opts.Model - } - fmt.Fprintf(&mf, "FROM %s\n", model) + var f parser.File + f.Commands = append(f.Commands, parser.Command{Name: "model", Args: cmp.Or(opts.ParentModel, opts.Model)}) + if opts.System != "" { - fmt.Fprintf(&mf, "SYSTEM \"\"\"%s\"\"\"\n", opts.System) + f.Commands = append(f.Commands, parser.Command{Name: "system", Args: opts.System}) } - if opts.Template != "" { - fmt.Fprintf(&mf, "TEMPLATE \"\"\"%s\"\"\"\n", opts.Template) - } - - keys := make([]string, 0) - for k := range opts.Options { - keys = append(keys, k) - } - sort.Strings(keys) + keys := maps.Keys(opts.Options) + slices.Sort(keys) for _, k := range keys { - fmt.Fprintf(&mf, "PARAMETER %s %v\n", k, opts.Options[k]) + v := opts.Options[k] + var cmds []parser.Command + switch t := v.(type) { + case []string: + for _, s := range t { + cmds = append(cmds, parser.Command{Name: k, Args: s}) + } + default: + cmds = append(cmds, parser.Command{Name: k, Args: fmt.Sprintf("%v", t)}) + } + + f.Commands = append(f.Commands, cmds...) } - fmt.Fprintln(&mf) for _, msg := range opts.Messages { - fmt.Fprintf(&mf, "MESSAGE %s \"\"\"%s\"\"\"\n", msg.Role, msg.Content) + f.Commands = append(f.Commands, parser.Command{Name: "message", Args: fmt.Sprintf("%s: %s", msg.Role, msg.Content)}) } - return mf.String() + return f.String() } func normalizeFilePath(fp string) string { diff --git a/cmd/interactive_test.go b/cmd/interactive_test.go index d9af01eb..bb7e0aba 100644 --- a/cmd/interactive_test.go +++ b/cmd/interactive_test.go @@ -1,12 +1,10 @@ package cmd import ( - "bytes" "testing" - "text/template" + "github.com/google/go-cmp/cmp" "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" "github.com/ollama/ollama/api" ) @@ -57,61 +55,53 @@ d:\path with\spaces\seven.svg inbetween7 c:\users\jdoe\eight.png inbetween8 func TestModelfileBuilder(t *testing.T) { opts := runOptions{ - Model: "hork", - System: "You are part horse and part shark, but all hork. Do horklike things", - Template: "This is a template.", + Model: "hork", + System: "You are part horse and part shark, but all hork. Do horklike things", Messages: []api.Message{ {Role: "user", Content: "Hey there hork!"}, {Role: "assistant", Content: "Yes it is true, I am half horse, half shark."}, }, - Options: map[string]interface{}{}, + Options: map[string]any{ + "temperature": 0.9, + "seed": 42, + "penalize_newline": false, + "stop": []string{"hi", "there"}, + }, } - opts.Options["temperature"] = 0.9 - opts.Options["seed"] = 42 - opts.Options["penalize_newline"] = false - opts.Options["stop"] = []string{"hi", "there"} - - mf := buildModelfile(opts) - expectedModelfile := `FROM {{.Model}} -SYSTEM """{{.System}}""" -TEMPLATE """{{.Template}}""" + t.Run("model", func(t *testing.T) { + expect := `FROM hork +SYSTEM You are part horse and part shark, but all hork. 
Do horklike things PARAMETER penalize_newline false PARAMETER seed 42 -PARAMETER stop [hi there] +PARAMETER stop hi +PARAMETER stop there PARAMETER temperature 0.9 - -MESSAGE user """Hey there hork!""" -MESSAGE assistant """Yes it is true, I am half horse, half shark.""" +MESSAGE user Hey there hork! +MESSAGE assistant Yes it is true, I am half horse, half shark. ` - tmpl, err := template.New("").Parse(expectedModelfile) - require.NoError(t, err) + actual := buildModelfile(opts) + if diff := cmp.Diff(expect, actual); diff != "" { + t.Errorf("mismatch (-want +got):\n%s", diff) + } + }) - var buf bytes.Buffer - err = tmpl.Execute(&buf, opts) - require.NoError(t, err) - assert.Equal(t, buf.String(), mf) - - opts.ParentModel = "horseshark" - mf = buildModelfile(opts) - expectedModelfile = `FROM {{.ParentModel}} -SYSTEM """{{.System}}""" -TEMPLATE """{{.Template}}""" + t.Run("parent model", func(t *testing.T) { + opts.ParentModel = "horseshark" + expect := `FROM horseshark +SYSTEM You are part horse and part shark, but all hork. Do horklike things PARAMETER penalize_newline false PARAMETER seed 42 -PARAMETER stop [hi there] +PARAMETER stop hi +PARAMETER stop there PARAMETER temperature 0.9 - -MESSAGE user """Hey there hork!""" -MESSAGE assistant """Yes it is true, I am half horse, half shark.""" +MESSAGE user Hey there hork! +MESSAGE assistant Yes it is true, I am half horse, half shark. ` - - tmpl, err = template.New("").Parse(expectedModelfile) - require.NoError(t, err) - - var parentBuf bytes.Buffer - err = tmpl.Execute(&parentBuf, opts) - require.NoError(t, err) - assert.Equal(t, parentBuf.String(), mf) + actual := buildModelfile(opts) + if diff := cmp.Diff(expect, actual); diff != "" { + t.Errorf("mismatch (-want +got):\n%s", diff) + } + }) } diff --git a/convert/mistral.go b/convert/mistral.go index da6874cf..8fe066d6 100644 --- a/convert/mistral.go +++ b/convert/mistral.go @@ -71,6 +71,11 @@ func (m *MistralModel) WriteGGUF(ws io.WriteSeeker) error { "tokenizer.ggml.unknown_token_id": uint32(0), } + if m.Params.HeadDimension > 0 { + kv["llama.attention.key_length"] = uint32(m.Params.HeadDimension) + kv["llama.attention.value_length"] = uint32(m.Params.HeadDimension) + } + return llm.NewGGUFV3(m.Params.ByteOrder).Encode(ws, kv, m.Tensors) } diff --git a/docs/api.md b/docs/api.md index c577bb1a..c0202ef3 100644 --- a/docs/api.md +++ b/docs/api.md @@ -40,6 +40,7 @@ Generate a response for a given prompt with a provided model. This is a streamin - `model`: (required) the [model name](#model-names) - `prompt`: the prompt to generate a response for +- `suffix`: the text after the model response - `images`: (optional) a list of base64-encoded images (for multimodal models such as `llava`) Advanced parameters (optional): @@ -57,7 +58,8 @@ Advanced parameters (optional): Enable JSON mode by setting the `format` parameter to `json`. This will structure the response as a valid JSON object. See the JSON mode [example](#request-json-mode) below. -> Note: it's important to instruct the model to use JSON in the `prompt`. Otherwise, the model may generate large amounts whitespace. +> [!IMPORTANT] +> It's important to instruct the model to use JSON in the `prompt`. Otherwise, the model may generate large amounts whitespace. 
### Examples @@ -148,8 +150,44 @@ If `stream` is set to `false`, the response will be a single JSON object: } ``` +#### Request (with suffix) + +##### Request + +```shell +curl http://localhost:11434/api/generate -d '{ + "model": "codellama:code", + "prompt": "def compute_gcd(a, b):", + "suffix": " return result", + "options": { + "temperature": 0 + }, + "stream": false +}' +``` + +##### Response + +```json +{ + "model": "codellama:code", + "created_at": "2024-07-22T20:47:51.147561Z", + "response": "\n if a == 0:\n return b\n else:\n return compute_gcd(b % a, a)\n\ndef compute_lcm(a, b):\n result = (a * b) / compute_gcd(a, b)\n", + "done": true, + "done_reason": "stop", + "context": [...], + "total_duration": 1162761250, + "load_duration": 6683708, + "prompt_eval_count": 17, + "prompt_eval_duration": 201222000, + "eval_count": 63, + "eval_duration": 953997000 +} +``` + #### Request (JSON mode) +> [!IMPORTANT] > When `format` is set to `json`, the output will always be a well-formed JSON object. It's important to also instruct the model to respond in JSON. ##### Request @@ -298,6 +336,7 @@ curl http://localhost:11434/api/generate -d '{ "num_predict": 100, "top_k": 20, "top_p": 0.9, + "min_p": 0.0, "tfs_z": 0.5, "typical_p": 0.7, "repeat_last_n": 33, @@ -380,12 +419,14 @@ Generate the next message in a chat with a provided model. This is a streaming e - `model`: (required) the [model name](#model-names) - `messages`: the messages of the chat, this can be used to keep a chat memory +- `tools`: tools for the model to use if supported. Requires `stream` to be set to `false` The `message` object has the following fields: -- `role`: the role of the message, either `system`, `user` or `assistant` +- `role`: the role of the message, either `system`, `user`, `assistant`, or `tool` - `content`: the content of the message - `images` (optional): a list of images to include in the message (for multimodal models such as `llava`) +- `tool_calls` (optional): a list of tools the model wants to use Advanced parameters (optional): @@ -546,7 +587,7 @@ Final response: ##### Request -Send a chat message with a conversation history. +Send a chat message with images. The images should be provided as an array, with the individual images encoded in Base64. ```shell curl http://localhost:11434/api/chat -d '{ @@ -622,6 +663,79 @@ curl http://localhost:11434/api/chat -d '{ } ``` +#### Chat request (with tools) + +##### Request + +``` +curl http://localhost:11434/api/chat -d '{ + "model": "mistral", + "messages": [ + { + "role": "user", + "content": "What is the weather today in Paris?" + } + ], + "stream": false, + "tools": [ + { + "type": "function", + "function": { + "name": "get_current_weather", + "description": "Get the current weather for a location", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The location to get the weather for, e.g. San Francisco, CA" + }, + "format": { + "type": "string", + "description": "The format to return the weather in, e.g. 
'celsius' or 'fahrenheit'", + "enum": ["celsius", "fahrenheit"] + } + }, + "required": ["location", "format"] + } + } + } + ] +}' +``` + +##### Response + +```json +{ + "model": "mistral:7b-instruct-v0.3-q4_K_M", + "created_at": "2024-07-22T20:33:28.123648Z", + "message": { + "role": "assistant", + "content": "", + "tool_calls": [ + { + "function": { + "name": "get_current_weather", + "arguments": { + "format": "celsius", + "location": "Paris, FR" + } + } + } + ] + }, + "done_reason": "stop", + "done": true, + "total_duration": 885095291, + "load_duration": 3753500, + "prompt_eval_count": 122, + "prompt_eval_duration": 328493000, + "eval_count": 33, + "eval_duration": 552222000 +} +``` + ## Create a Model ```shell @@ -1026,7 +1140,7 @@ If `stream` is set to `false`, then the response is a single JSON object: ## Generate Embeddings ```shell -POST /api/embeddings +POST /api/embed ``` Generate embeddings from a model @@ -1034,10 +1148,11 @@ Generate embeddings from a model ### Parameters - `model`: name of model to generate embeddings from -- `prompt`: text to generate embeddings for +- `input`: text or list of text to generate embeddings for Advanced parameters: +- `truncate`: truncates the end of each input to fit within context length. Returns error if `false` and context length is exceeded. Defaults to `true` - `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature` - `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`) @@ -1046,9 +1161,9 @@ Advanced parameters: #### Request ```shell -curl http://localhost:11434/api/embeddings -d '{ +curl http://localhost:11434/api/embed -d '{ "model": "all-minilm", - "prompt": "Here is an article about llamas..." + "input": "Why is the sky blue?" }' ``` @@ -1056,10 +1171,35 @@ curl http://localhost:11434/api/embeddings -d '{ ```json { - "embedding": [ - 0.5670403838157654, 0.009260174818336964, 0.23178744316101074, -0.2916173040866852, -0.8924556970596313, - 0.8785552978515625, -0.34576427936553955, 0.5742510557174683, -0.04222835972905159, -0.137906014919281 - ] + "model": "all-minilm", + "embeddings": [[ + 0.010071029, -0.0017594862, 0.05007221, 0.04692972, 0.054916814, + 0.008599704, 0.105441414, -0.025878139, 0.12958129, 0.031952348 + ]] +} +``` + +#### Request (Multiple input) + +```shell +curl http://localhost:11434/api/embed -d '{ + "model": "all-minilm", + "input": ["Why is the sky blue?", "Why is the grass green?"] +}' +``` + +#### Response + +```json +{ + "model": "all-minilm", + "embeddings": [[ + 0.010071029, -0.0017594862, 0.05007221, 0.04692972, 0.054916814, + 0.008599704, 0.105441414, -0.025878139, 0.12958129, 0.031952348 + ],[ + -0.0098027075, 0.06042469, 0.025257962, -0.006364387, 0.07272725, + 0.017194884, 0.09032035, -0.051705178, 0.09951512, 0.09072481 + ]] } ``` @@ -1106,3 +1246,45 @@ A single JSON object will be returned. 
] } ``` + +## Generate Embedding + +> Note: this endpoint has been superseded by `/api/embed` + +```shell +POST /api/embeddings +``` + +Generate embeddings from a model + +### Parameters + +- `model`: name of model to generate embeddings from +- `prompt`: text to generate embeddings for + +Advanced parameters: + +- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature` +- `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`) + +### Examples + +#### Request + +```shell +curl http://localhost:11434/api/embeddings -d '{ + "model": "all-minilm", + "prompt": "Here is an article about llamas..." +}' +``` + +#### Response + +```json +{ + "embedding": [ + 0.5670403838157654, 0.009260174818336964, 0.23178744316101074, -0.2916173040866852, -0.8924556970596313, + 0.8785552978515625, -0.34576427936553955, 0.5742510557174683, -0.04222835972905159, -0.137906014919281 + ] +} +``` diff --git a/docs/docker.md b/docs/docker.md index 0b58562b..a34c3291 100644 --- a/docs/docker.md +++ b/docs/docker.md @@ -63,7 +63,7 @@ docker run -d --device /dev/kfd --device /dev/dri -v ollama:/root/.ollama -p 114 Now you can run a model: ``` -docker exec -it ollama ollama run llama3 +docker exec -it ollama ollama run llama3.1 ``` ### Try different models diff --git a/docs/faq.md b/docs/faq.md index e48507d3..494367cc 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -221,7 +221,7 @@ curl http://localhost:11434/api/chat -d '{"model": "mistral"}' To preload a model using the CLI, use the command: ```shell -ollama run llama3 "" +ollama run llama3.1 "" ``` ## How do I keep a model loaded in memory or make it unload immediately? @@ -267,3 +267,7 @@ The following server settings may be used to adjust how Ollama handles concurren - `OLLAMA_MAX_QUEUE` - The maximum number of requests Ollama will queue when busy before rejecting additional requests. The default is 512 Note: Windows with Radeon GPUs currently default to 1 model maximum due to limitations in ROCm v5.7 for available VRAM reporting. Once ROCm v6.2 is available, Windows Radeon will follow the defaults above. You may enable concurrent model loads on Radeon on Windows, but ensure you don't load more models than will fit into your GPUs VRAM. + +## How does Ollama load models on multiple GPUs? + +Installing multiple GPUs of the same brand can be a great way to increase your available VRAM to load larger models. When you load a new model, Ollama evaluates the required VRAM for the model against what is currently available. If the model will entirely fit on any single GPU, Ollama will load the model on that GPU. This typically provides the best performance as it reduces the amount of data transfering across the PCI bus during inference. If the model does not fit entirely on one GPU, then it will be spread across all the available GPUs. 
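To make the placement rule in that FAQ answer concrete, here is a purely illustrative Go sketch of the decision it describes: prefer a single GPU with enough free VRAM, otherwise spread. The `gpuInfo` type and `pickGPUs` function are hypothetical stand-ins for illustration only, not Ollama's actual scheduler code.

```go
package main

import "fmt"

// gpuInfo is a hypothetical stand-in for per-GPU bookkeeping; it is not an Ollama type.
type gpuInfo struct {
	ID       string
	FreeVRAM uint64 // bytes
}

// pickGPUs illustrates the rule described above: load on the first single GPU
// that can hold the whole model, otherwise spread across every available GPU.
func pickGPUs(gpus []gpuInfo, modelVRAM uint64) []gpuInfo {
	for _, g := range gpus {
		if g.FreeVRAM >= modelVRAM {
			// Fits entirely on one GPU: avoids shuttling data across the PCI bus.
			return []gpuInfo{g}
		}
	}
	return gpus
}

func main() {
	gpus := []gpuInfo{{ID: "GPU-0", FreeVRAM: 12 << 30}, {ID: "GPU-1", FreeVRAM: 24 << 30}}
	fmt.Println(pickGPUs(gpus, 16<<30)) // fits on the 24 GiB card alone
	fmt.Println(pickGPUs(gpus, 40<<30)) // too large for either card: spread across both
}
```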
diff --git a/docs/gpu.md b/docs/gpu.md index 80f276c3..e669ea32 100644 --- a/docs/gpu.md +++ b/docs/gpu.md @@ -46,13 +46,24 @@ sudo modprobe nvidia_uvm` ## AMD Radeon Ollama supports the following AMD GPUs: + +### Linux Support | Family | Cards and accelerators | | -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- | | AMD Radeon RX | `7900 XTX` `7900 XT` `7900 GRE` `7800 XT` `7700 XT` `7600 XT` `7600` `6950 XT` `6900 XTX` `6900XT` `6800 XT` `6800` `Vega 64` `Vega 56` | | AMD Radeon PRO | `W7900` `W7800` `W7700` `W7600` `W7500` `W6900X` `W6800X Duo` `W6800X` `W6800` `V620` `V420` `V340` `V320` `Vega II Duo` `Vega II` `VII` `SSG` | | AMD Instinct | `MI300X` `MI300A` `MI300` `MI250X` `MI250` `MI210` `MI200` `MI100` `MI60` `MI50` | -### Overrides +### Windows Support +With ROCm v6.1, the following GPUs are supported on Windows. + +| Family | Cards and accelerators | +| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- | +| AMD Radeon RX | `7900 XTX` `7900 XT` `7900 GRE` `7800 XT` `7700 XT` `7600 XT` `7600` `6950 XT` `6900 XTX` `6900XT` `6800 XT` `6800` | +| AMD Radeon PRO | `W7900` `W7800` `W7700` `W7600` `W7500` `W6900X` `W6800X Duo` `W6800X` `W6800` `V620` | + + +### Overrides on Linux Ollama leverages the AMD ROCm library, which does not support all AMD GPUs. In some cases you can force the system to try to use a similar LLVM target that is close. For example The Radeon RX 5400 is `gfx1034` (also known as 10.3.4) @@ -63,7 +74,7 @@ would set `HSA_OVERRIDE_GFX_VERSION="10.3.0"` as an environment variable for the server. If you have an unsupported AMD GPU you can experiment using the list of supported types below. -At this time, the known supported GPU types are the following LLVM Targets. +At this time, the known supported GPU types on linux are the following LLVM Targets. This table shows some example GPUs that map to these LLVM targets: | **LLVM Target** | **An Example GPU** | |-----------------|---------------------| diff --git a/docs/modelfile.md b/docs/modelfile.md index 21ee1826..852bf96c 100644 --- a/docs/modelfile.md +++ b/docs/modelfile.md @@ -1,6 +1,7 @@ # Ollama Model File -> Note: `Modelfile` syntax is in development +> [!NOTE] +> `Modelfile` syntax is in development A model file is the blueprint to create and share models with Ollama. @@ -140,6 +141,7 @@ PARAMETER | num_predict | Maximum number of tokens to predict when generating text. (Default: 128, -1 = infinite generation, -2 = fill context) | int | num_predict 42 | | top_k | Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40) | int | top_k 40 | | top_p | Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9) | float | top_p 0.9 | +| min_p | Alternative to the top_p, and aims to ensure a balance of quality and variety. The parameter *p* represents the minimum probability for a token to be considered, relative to the probability of the most likely token. For example, with *p*=0.05 and the most likely token having a probability of 0.9, logits with a value less than 0.045 are filtered out. 
(Default: 0.0) | float | min_p 0.05 | ### TEMPLATE diff --git a/docs/openai.md b/docs/openai.md index 9dda05c3..fee30f71 100644 --- a/docs/openai.md +++ b/docs/openai.md @@ -78,8 +78,8 @@ curl http://localhost:11434/v1/chat/completions \ - [x] Streaming - [x] JSON mode - [x] Reproducible outputs +- [x] Tools (streaming support coming soon) - [ ] Vision -- [ ] Function calling - [ ] Logprobs #### Supported request fields @@ -97,16 +97,12 @@ curl http://localhost:11434/v1/chat/completions \ - [x] `temperature` - [x] `top_p` - [x] `max_tokens` -- [ ] `logit_bias` -- [ ] `tools` +- [x] `tools` - [ ] `tool_choice` +- [ ] `logit_bias` - [ ] `user` - [ ] `n` -#### Notes - -- `usage.prompt_tokens` will be 0 for completions where prompt evaluation is cached - ## Models Before using a model, pull it locally `ollama pull`: diff --git a/docs/template.md b/docs/template.md new file mode 100644 index 00000000..f6ce06ba --- /dev/null +++ b/docs/template.md @@ -0,0 +1,173 @@ +# Template + +Ollama provides a powerful templating engine backed by Go's built-in templating engine to construct prompts for your large language model. This feature is a valuable tool to get the most out of your models. + +## Basic Template Structure + +A basic Go template consists of three main parts: + +* **Layout**: The overall structure of the template. +* **Variables**: Placeholders for dynamic data that will be replaced with actual values when the template is rendered. +* **Functions**: Custom functions or logic that can be used to manipulate the template's content. + +Here's an example of a simple chat template: + +```gotmpl +{{- range .Messages }} +{{ .Role }}: {{ .Content }} +{{- end }} +``` + +In this example, we have: + +* A basic messages structure (layout) +* Three variables: `Messages`, `Role`, and `Content` (variables) +* A custom function (action) that iterates over an array of items (`range .Messages`) and displays each item + +## Adding templates to your model + +By default, models imported into Ollama have a default template of `{{ .Prompt }}`, i.e. user inputs are sent verbatim to the LLM. This is appropriate for text or code completion models but lacks essential markers for chat or instruction models. + +Omitting a template in these models puts the responsibility of correctly templating input onto the user. Adding a template allows users to easily get the best results from the model. + +To add templates in your model, you'll need to add a `TEMPLATE` command to the Modelfile. Here's an example using Meta's Llama 3. 
+ +```dockerfile +FROM llama3 + +TEMPLATE """{{- if .System }}<|start_header_id|>system<|end_header_id|> + +{{ .System }}<|eot_id|> +{{- end }} +{{- range .Messages }}<|start_header_id|>{{ .Role }}<|end_header_id|> + +{{ .Content }}<|eot_id|> +{{- end }}<|start_header_id|>assistant<|end_header_id|> + +""" +``` + +## Variables + +`System` (string): system prompt + +`Prompt` (string): user prompt + +`Response` (string): assistant response + +`Suffix` (string): text inserted after the assistant's response + +`Messages` (list): list of messages + +`Messages[].Role` (string): role which can be one of `system`, `user`, `assistant`, or `tool` + +`Messages[].Content` (string): message content + +`Messages[].ToolCalls` (list): list of tools the model wants to call + +`Messages[].ToolCalls[].Function` (object): function to call + +`Messages[].ToolCalls[].Function.Name` (string): function name + +`Messages[].ToolCalls[].Function.Arguments` (map): mapping of argument name to argument value + +`Tools` (list): list of tools the model can access + +`Tools[].Type` (string): schema type. `type` is always `function` + +`Tools[].Function` (object): function definition + +`Tools[].Function.Name` (string): function name + +`Tools[].Function.Description` (string): function description + +`Tools[].Function.Parameters` (object): function parameters + +`Tools[].Function.Parameters.Type` (string): schema type. `type` is always `object` + +`Tools[].Function.Parameters.Required` (list): list of required properties + +`Tools[].Function.Parameters.Properties` (map): mapping of property name to property definition + +`Tools[].Function.Parameters.Properties[].Type` (string): property type + +`Tools[].Function.Parameters.Properties[].Description` (string): property description + +`Tools[].Function.Parameters.Properties[].Enum` (list): list of valid values + +## Tips and Best Practices + +Keep the following tips and best practices in mind when working with Go templates: + +* **Be mindful of dot**: Control flow structures like `range` and `with` changes the value `.` +* **Out-of-scope variables**: Use `$.` to reference variables not currently in scope, starting from the root +* **Whitespace control**: Use `-` to trim leading (`{{-`) and trailing (`-}}`) whitespace + +## Examples + +### Example Messages + +#### ChatML + +ChatML is a popular template format. It can be used for models such as Databrick's DBRX, Intel's Neural Chat, and Microsoft's Orca 2. + +```gotmpl +{{- if .System }}<|im_start|>system +{{ .System }}<|im_end|> +{{ end }} +{{- range .Messages }}<|im_start|>{{ .Role }} +{{ .Content }}<|im_end|> +{{ end }}<|im_start|>assistant +{{ else }} +{{ if .System }}<|im_start|>system +{{ .System }}<|im_end|> +``` + +### Example Tools + +Tools support can be added to a model by adding a `{{ .Tools }}` node to the template. This feature is useful for models trained to call external tools and can a powerful tool for retrieving real-time data or performing complex tasks. + +#### Mistral + +Mistral v0.3 and Mixtral 8x22B supports tool calling. 
+ +```gotmpl +{{- range $index, $_ := .Messages }} +{{- if eq .Role "user" }} +{{- if and (le (len (slice $.Messages $index)) 2) $.Tools }}[AVAILABLE_TOOLS] {{ json $.Tools }}[/AVAILABLE_TOOLS] +{{- end }}[INST] {{ if and (eq (len (slice $.Messages $index)) 1) $.System }}{{ $.System }} + +{{ end }}{{ .Content }}[/INST] +{{- else if eq .Role "assistant" }} +{{- if .Content }} {{ .Content }} +{{- else if .ToolCalls }}[TOOL_CALLS] [ +{{- range .ToolCalls }}{"name": "{{ .Function.Name }}", "arguments": {{ json .Function.Arguments }}} +{{- end }}] +{{- end }} +{{- else if eq .Role "tool" }}[TOOL_RESULTS] {"content": {{ .Content }}}[/TOOL_RESULTS] +{{- end }} +{{- end }} +``` + +### Example Fill-in-Middle + +Fill-in-middle support can be added to a model by adding a `{{ .Suffix }}` node to the template. This feature is useful for models that are trained to generate text in the middle of user input, such as code completion models. + +#### CodeLlama + +CodeLlama [7B](https://ollama.com/library/codellama:7b-code) and [13B](https://ollama.com/library/codellama:13b-code) code completion models support fill-in-middle. + +```gotmpl +
+<PRE> {{ .Prompt }} <SUF>{{ .Suffix }} <MID>
+```
+
+> [!NOTE]
+> CodeLlama 34B and 70B code completion models, as well as all instruct and Python fine-tuned variants, do not support fill-in-middle.
+
+#### Codestral
+
+Codestral [22B](https://ollama.com/library/codestral:22b) supports fill-in-middle.
+
+```gotmpl
+[SUFFIX]{{ .Suffix }}[PREFIX] {{ .Prompt }}
+```
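For completeness, here is a minimal Go sketch of a fill-in-middle request using the `Suffix` field this PR adds to `api.GenerateRequest`, mirroring the `codellama:code` curl example in docs/api.md. `ClientFromEnvironment`, `Generate`, and the `Stream` field belong to the existing client API and are assumed unchanged here.

```go
package main

import (
	"context"
	"fmt"
	"log"

	"github.com/ollama/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}

	stream := false
	req := &api.GenerateRequest{
		Model:  "codellama:7b-code",
		Prompt: "def compute_gcd(a, b):",
		Suffix: "    return result", // the text that should follow the generated middle
		Stream: &stream,
	}

	// With stream=false the callback fires once with the full infilled response.
	err = client.Generate(context.Background(), req, func(resp api.GenerateResponse) error {
		fmt.Println(resp.Response)
		return nil
	})
	if err != nil {
		log.Fatal(err)
	}
}
```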
diff --git a/docs/tutorials/langchainjs.md b/docs/tutorials/langchainjs.md
index 4d60afb6..f925869b 100644
--- a/docs/tutorials/langchainjs.md
+++ b/docs/tutorials/langchainjs.md
@@ -15,7 +15,7 @@ import { Ollama } from "@langchain/community/llms/ollama";
 
 const ollama = new Ollama({
   baseUrl: "http://localhost:11434",
-  model: "llama3",
+  model: "llama3.1",
 });
 
 const answer = await ollama.invoke(`why is the sky blue?`);
@@ -23,7 +23,7 @@ const answer = await ollama.invoke(`why is the sky blue?`);
 console.log(answer);
 ```
 
-That will get us the same thing as if we ran `ollama run llama3 "why is the sky blue"` in the terminal. But we want to load a document from the web to ask a question against. **Cheerio** is a great library for ingesting a webpage, and **LangChain** uses it in their **CheerioWebBaseLoader**. So let's install **Cheerio** and build that part of the app.
+That will get us the same thing as if we ran `ollama run llama3.1 "why is the sky blue"` in the terminal. But we want to load a document from the web to ask a question against. **Cheerio** is a great library for ingesting a webpage, and **LangChain** uses it in their **CheerioWebBaseLoader**. So let's install **Cheerio** and build that part of the app.
 
 ```bash
 npm install cheerio
diff --git a/docs/windows.md b/docs/windows.md
index 69c2aa6d..dbfc1440 100644
--- a/docs/windows.md
+++ b/docs/windows.md
@@ -23,6 +23,8 @@ Logs will often be helpful in diagnosing the problem (see
 * NVIDIA 452.39 or newer Drivers if you have an NVIDIA card
 * AMD Radeon Driver https://www.amd.com/en/support if you have a Radeon card
 
+Ollama uses Unicode characters for progress indication, which may render as unknown squares in some older terminal fonts on Windows 10. If you see this, try changing your terminal font settings.
+
 ## API Access
 
 Here's a quick example showing API access from `powershell`
diff --git a/envconfig/config.go b/envconfig/config.go
index 62d661eb..0abc6968 100644
--- a/envconfig/config.go
+++ b/envconfig/config.go
@@ -43,8 +43,6 @@ var (
 	MaxRunners int
 	// Set via OLLAMA_MAX_QUEUE in the environment
 	MaxQueuedRequests int
-	// Set via OLLAMA_MAX_VRAM in the environment
-	MaxVRAM uint64
 	// Set via OLLAMA_MODELS in the environment
 	ModelsDir string
 	// Set via OLLAMA_NOHISTORY in the environment
@@ -89,7 +87,6 @@ func AsMap() map[string]EnvVar {
 		"OLLAMA_LLM_LIBRARY":       {"OLLAMA_LLM_LIBRARY", LLMLibrary, "Set LLM library to bypass autodetection"},
 		"OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners, "Maximum number of loaded models per GPU"},
 		"OLLAMA_MAX_QUEUE":         {"OLLAMA_MAX_QUEUE", MaxQueuedRequests, "Maximum number of queued requests"},
-		"OLLAMA_MAX_VRAM":          {"OLLAMA_MAX_VRAM", MaxVRAM, "Maximum VRAM"},
 		"OLLAMA_MODELS":            {"OLLAMA_MODELS", ModelsDir, "The path to the models directory"},
 		"OLLAMA_NOHISTORY":         {"OLLAMA_NOHISTORY", NoHistory, "Do not preserve readline history"},
 		"OLLAMA_NOPRUNE":           {"OLLAMA_NOPRUNE", NoPrune, "Do not prune model blobs on startup"},
@@ -194,16 +191,6 @@ func LoadConfig() {
 
 	TmpDir = clean("OLLAMA_TMPDIR")
 
-	userLimit := clean("OLLAMA_MAX_VRAM")
-	if userLimit != "" {
-		avail, err := strconv.ParseUint(userLimit, 10, 64)
-		if err != nil {
-			slog.Error("invalid setting, ignoring", "OLLAMA_MAX_VRAM", userLimit, "error", err)
-		} else {
-			MaxVRAM = avail
-		}
-	}
-
 	LLMLibrary = clean("OLLAMA_LLM_LIBRARY")
 
 	if onp := clean("OLLAMA_NUM_PARALLEL"); onp != "" {
diff --git a/examples/go-chat/main.go b/examples/go-chat/main.go
index 5266f03e..7663fb8f 100644
--- a/examples/go-chat/main.go
+++ b/examples/go-chat/main.go
@@ -35,7 +35,7 @@ func main() {
 
 	ctx := context.Background()
 	req := &api.ChatRequest{
-		Model:    "llama3",
+		Model:    "llama3.1",
 		Messages: messages,
 	}
 
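Since this PR threads tool definitions through `api.ChatRequest` and surfaces the model's requested calls on `Message.ToolCalls`, a minimal Go sketch of that flow follows, mirroring the curl example added to docs/api.md. The tool is unmarshalled from JSON to avoid spelling out the anonymous `Parameters` struct by hand; `ClientFromEnvironment`, `Chat`, and the `Stream` field are the existing client API and are assumed unchanged.

```go
package main

import (
	"context"
	"encoding/json"
	"fmt"
	"log"

	"github.com/ollama/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}

	// Define the tool as JSON and unmarshal it into api.Tools.
	var tools api.Tools
	if err := json.Unmarshal([]byte(`[{
		"type": "function",
		"function": {
			"name": "get_current_weather",
			"description": "Get the current weather for a location",
			"parameters": {
				"type": "object",
				"required": ["location"],
				"properties": {
					"location": {"type": "string", "description": "e.g. Paris, FR"}
				}
			}
		}
	}]`), &tools); err != nil {
		log.Fatal(err)
	}

	stream := false // per docs/api.md, tools currently require stream=false
	req := &api.ChatRequest{
		Model:    "mistral",
		Messages: []api.Message{{Role: "user", Content: "What is the weather today in Paris?"}},
		Tools:    tools,
		Stream:   &stream,
	}

	err = client.Chat(context.Background(), req, func(resp api.ChatResponse) error {
		// The model answers either with content or with one or more tool calls.
		for _, call := range resp.Message.ToolCalls {
			fmt.Printf("model wants %s(%s)\n", call.Function.Name, call.Function.Arguments.String())
		}
		return nil
	})
	if err != nil {
		log.Fatal(err)
	}
}
```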
diff --git a/examples/go-generate-streaming/main.go b/examples/go-generate-streaming/main.go
index 49403351..3acfb22a 100644
--- a/examples/go-generate-streaming/main.go
+++ b/examples/go-generate-streaming/main.go
@@ -16,7 +16,7 @@ func main() {
 
 	// By default, GenerateRequest is streaming.
 	req := &api.GenerateRequest{
-		Model:  "gemma",
+		Model:  "gemma2",
 		Prompt: "how many planets are there?",
 	}
 
diff --git a/examples/go-generate/main.go b/examples/go-generate/main.go
index 50fbf64b..2fe28742 100644
--- a/examples/go-generate/main.go
+++ b/examples/go-generate/main.go
@@ -15,7 +15,7 @@ func main() {
 	}
 
 	req := &api.GenerateRequest{
-		Model:  "gemma",
+		Model:  "gemma2",
 		Prompt: "how many planets are there?",
 
 		// set streaming to false
diff --git a/examples/go-http-generate/README.md b/examples/go-http-generate/README.md
deleted file mode 100644
index e69de29b..00000000
diff --git a/examples/langchain-python-rag-document/README.md b/examples/langchain-python-rag-document/README.md
index 20a73a88..e2f3bc02 100644
--- a/examples/langchain-python-rag-document/README.md
+++ b/examples/langchain-python-rag-document/README.md
@@ -4,6 +4,14 @@ This example provides an interface for asking questions to a PDF document.
 
 ## Setup
 
+1. Ensure you have the `llama3.1` model installed:
+
+```
+ollama pull llama3.1
+```
+
+2. Install the Python Requirements.
+
 ```
 pip install -r requirements.txt
 ```
diff --git a/examples/langchain-python-rag-document/main.py b/examples/langchain-python-rag-document/main.py
index 3ed9499f..6f7cec9b 100644
--- a/examples/langchain-python-rag-document/main.py
+++ b/examples/langchain-python-rag-document/main.py
@@ -51,7 +51,7 @@ while True:
         template=template,
     )
 
-    llm = Ollama(model="llama3:8b", callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]))
+    llm = Ollama(model="llama3.1", callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]))
     qa_chain = RetrievalQA.from_chain_type(
         llm,
         retriever=vectorstore.as_retriever(),
diff --git a/examples/langchain-python-rag-websummary/README.md b/examples/langchain-python-rag-websummary/README.md
index 3f3b9873..29c706a3 100644
--- a/examples/langchain-python-rag-websummary/README.md
+++ b/examples/langchain-python-rag-websummary/README.md
@@ -4,10 +4,10 @@ This example summarizes the website, [https://ollama.com/blog/run-llama2-uncenso
 
 ## Running the Example
 
-1. Ensure you have the `llama2` model installed:
+1. Ensure you have the `llama3.1` model installed:
 
    ```bash
-   ollama pull llama2
+   ollama pull llama3.1
    ```
 
 2. Install the Python Requirements.
diff --git a/examples/langchain-python-rag-websummary/main.py b/examples/langchain-python-rag-websummary/main.py
index d1b05ba8..77b09fbb 100644
--- a/examples/langchain-python-rag-websummary/main.py
+++ b/examples/langchain-python-rag-websummary/main.py
@@ -5,8 +5,8 @@ from langchain.chains.summarize import load_summarize_chain
 loader = WebBaseLoader("https://ollama.com/blog/run-llama2-uncensored-locally")
 docs = loader.load()
 
-llm = Ollama(model="llama3")
+llm = Ollama(model="llama3.1")
 chain = load_summarize_chain(llm, chain_type="stuff")
 
-result = chain.invoke(docs) 
+result = chain.invoke(docs)
 print(result)
diff --git a/examples/langchain-python-simple/README.md b/examples/langchain-python-simple/README.md
index d4102dec..60db2c8c 100644
--- a/examples/langchain-python-simple/README.md
+++ b/examples/langchain-python-simple/README.md
@@ -4,10 +4,10 @@ This example is a basic "hello world" of using LangChain with Ollama.
 
 ## Running the Example
 
-1. Ensure you have the `llama3` model installed:
+1. Ensure you have the `llama3.1` model installed:
 
    ```bash
-   ollama pull llama3
+   ollama pull llama3.1
    ```
 
 2. Install the Python Requirements.
diff --git a/examples/langchain-python-simple/main.py b/examples/langchain-python-simple/main.py
index 7cb65286..a7ed81d6 100644
--- a/examples/langchain-python-simple/main.py
+++ b/examples/langchain-python-simple/main.py
@@ -1,6 +1,6 @@
 from langchain.llms import Ollama
 
 input = input("What is your question?")
-llm = Ollama(model="llama3")
+llm = Ollama(model="llama3.1")
 res = llm.predict(input)
 print (res)
diff --git a/examples/modelfile-mario/Modelfile b/examples/modelfile-mario/Modelfile
index 33d5952b..a3747086 100644
--- a/examples/modelfile-mario/Modelfile
+++ b/examples/modelfile-mario/Modelfile
@@ -1,4 +1,4 @@
-FROM llama3
+FROM llama3.1
 PARAMETER temperature 1
 SYSTEM """
 You are Mario from super mario bros, acting as an assistant.
diff --git a/examples/modelfile-mario/readme.md b/examples/modelfile-mario/readme.md
index e4f0d417..c3f34197 100644
--- a/examples/modelfile-mario/readme.md
+++ b/examples/modelfile-mario/readme.md
@@ -2,12 +2,12 @@
 
 # Example character: Mario
 
-This example shows how to create a basic character using Llama3 as the base model.
+This example shows how to create a basic character using Llama 3.1 as the base model.
 
 To run this example:
 
 1. Download the Modelfile
-2. `ollama pull llama3` to get the base model used in the model file.
+2. `ollama pull llama3.1` to get the base model used in the model file.
 3. `ollama create NAME -f ./Modelfile`
 4. `ollama run NAME`
 
@@ -18,7 +18,7 @@ Ask it some questions like "Who are you?" or "Is Peach in trouble again?"
 What the model file looks like:
 
 ```
-FROM llama3
+FROM llama3.1
 PARAMETER temperature 1
 SYSTEM """
 You are Mario from Super Mario Bros, acting as an assistant.
diff --git a/examples/python-dockerit/dockerit.py b/examples/python-dockerit/dockerit.py
index b013102f..6a288d90 100644
--- a/examples/python-dockerit/dockerit.py
+++ b/examples/python-dockerit/dockerit.py
@@ -4,7 +4,7 @@ imageName = input("Enter the name of the image: ")
 client = docker.from_env()
 s = requests.Session()
 output=""
-with s.post('http://localhost:11434/api/generate', json={'model': 'dockerit', 'prompt': inputDescription}, stream=True) as r:
+with s.post('http://localhost:11434/api/generate', json={'model': 'mattw/dockerit', 'prompt': inputDescription}, stream=True) as r:
   for line in r.iter_lines():
     if line:
       j = json.loads(line)
diff --git a/examples/python-json-datagenerator/predefinedschema.py b/examples/python-json-datagenerator/predefinedschema.py
index 1fd54892..68090ad7 100644
--- a/examples/python-json-datagenerator/predefinedschema.py
+++ b/examples/python-json-datagenerator/predefinedschema.py
@@ -2,7 +2,7 @@ import requests
 import json
 import random
 
-model = "llama3"
+model = "llama3.1"
 template = {
   "firstName": "",
   "lastName": "",
diff --git a/examples/python-json-datagenerator/randomaddresses.py b/examples/python-json-datagenerator/randomaddresses.py
index 72b1fefb..878c9803 100644
--- a/examples/python-json-datagenerator/randomaddresses.py
+++ b/examples/python-json-datagenerator/randomaddresses.py
@@ -12,7 +12,7 @@ countries = [
     "France",
 ]
 country = random.choice(countries)
-model = "llama3"
+model = "llama3.1"
 
 prompt = f"generate one realistically believable sample data set of a persons first name, last name, address in {country}, and phone number. Do not use common names. Respond using JSON. Key names should have no backslashes, values should use plain ascii with no special characters."
 
diff --git a/examples/python-json-datagenerator/readme.md b/examples/python-json-datagenerator/readme.md
index 88357044..5b444dff 100644
--- a/examples/python-json-datagenerator/readme.md
+++ b/examples/python-json-datagenerator/readme.md
@@ -6,10 +6,10 @@ There are two python scripts in this example. `randomaddresses.py` generates ran
 
 ## Running the Example
 
-1. Ensure you have the `llama3` model installed:
+1. Ensure you have the `llama3.1` model installed:
 
    ```bash
-   ollama pull llama3
+   ollama pull llama3.1
    ```
 
 2. Install the Python Requirements.
diff --git a/examples/python-simplechat/client.py b/examples/python-simplechat/client.py
index f82a16b3..85043d5f 100644
--- a/examples/python-simplechat/client.py
+++ b/examples/python-simplechat/client.py
@@ -2,7 +2,7 @@ import json
 import requests
 
 # NOTE: ollama must be running for this to work, start the ollama app or run `ollama serve`
-model = "llama3"  # TODO: update this for whatever model you wish to use
+model = "llama3.1"  # TODO: update this for whatever model you wish to use
 
 
 def chat(messages):
diff --git a/examples/python-simplechat/readme.md b/examples/python-simplechat/readme.md
index dd2576bc..4c2ded4d 100644
--- a/examples/python-simplechat/readme.md
+++ b/examples/python-simplechat/readme.md
@@ -4,10 +4,10 @@ The **chat** endpoint is one of two ways to generate text from an LLM with Ollam
 
 ## Running the Example
 
-1. Ensure you have the `llama3` model installed:
+1. Ensure you have the `llama3.1` model installed:
 
    ```bash
-   ollama pull llama3
+   ollama pull llama3.1
    ```
 
 2. Install the Python Requirements.
diff --git a/examples/typescript-simplechat/client.ts b/examples/typescript-simplechat/client.ts
index a1e0eea3..8ad113b1 100644
--- a/examples/typescript-simplechat/client.ts
+++ b/examples/typescript-simplechat/client.ts
@@ -1,6 +1,6 @@
 import * as readline from "readline";
 
-const model = "llama3";
+const model = "llama3.1";
 type Message = {
   role: "assistant" | "user" | "system";
   content: string;
diff --git a/gpu/amd_hip_windows.go b/gpu/amd_hip_windows.go
index 2586278c..98806234 100644
--- a/gpu/amd_hip_windows.go
+++ b/gpu/amd_hip_windows.go
@@ -33,9 +33,10 @@ type HipLib struct {
 }
 
 func NewHipLib() (*HipLib, error) {
-	h, err := windows.LoadLibrary("amdhip64.dll")
+	// At runtime we depend on v6, so discover GPUs with the same library for a consistent set of GPUs
+	h, err := windows.LoadLibrary("amdhip64_6.dll")
 	if err != nil {
-		return nil, fmt.Errorf("unable to load amdhip64.dll: %w", err)
+		return nil, fmt.Errorf("unable to load amdhip64_6.dll, please make sure to upgrade to the latest amd driver: %w", err)
 	}
 	hl := &HipLib{}
 	hl.dll = h
diff --git a/gpu/amd_linux.go b/gpu/amd_linux.go
index 15b6fc61..6493af9e 100644
--- a/gpu/amd_linux.go
+++ b/gpu/amd_linux.go
@@ -10,6 +10,7 @@ import (
 	"path/filepath"
 	"regexp"
 	"slices"
+	"sort"
 	"strconv"
 	"strings"
 
@@ -82,6 +83,20 @@ func AMDGetGPUInfo() []RocmGPUInfo {
 	// The amdgpu driver always exposes the host CPU(s) first, but we have to skip them and subtract
 	// from the other IDs to get alignment with the HIP libraries expectations (zero is the first GPU, not the CPU)
 	matches, _ := filepath.Glob(GPUPropertiesFileGlob)
+	sort.Slice(matches, func(i, j int) bool {
+		// /sys/class/kfd/kfd/topology/nodes/<number>/properties
+		a, err := strconv.ParseInt(filepath.Base(filepath.Dir(matches[i])), 10, 64)
+		if err != nil {
+			slog.Debug("parse err", "error", err, "match", matches[i])
+			return false
+		}
+		b, err := strconv.ParseInt(filepath.Base(filepath.Dir(matches[j])), 10, 64)
+		if err != nil {
+			slog.Debug("parse err", "error", err, "match", matches[j])
+			return false
+		}
+		return a < b
+	})
 	cpuCount := 0
 	for _, match := range matches {
 		slog.Debug("evaluating amdgpu node " + match)
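
Note: the numeric sort above matters because lexical ordering would place node 10 before node 2, misaligning GPU indices with what the HIP runtime reports. A minimal, self-contained sketch of the same idea (hypothetical kfd topology paths rather than a real sysfs scan, not part of the patch):

```go
package main

import (
	"fmt"
	"path/filepath"
	"sort"
	"strconv"
)

func main() {
	// Hypothetical /sys/class/kfd/kfd/topology/nodes/<number>/properties paths.
	matches := []string{
		"/sys/class/kfd/kfd/topology/nodes/10/properties",
		"/sys/class/kfd/kfd/topology/nodes/2/properties",
		"/sys/class/kfd/kfd/topology/nodes/1/properties",
	}
	sort.Slice(matches, func(i, j int) bool {
		// Compare the numeric directory component, not the full path string.
		a, errA := strconv.ParseInt(filepath.Base(filepath.Dir(matches[i])), 10, 64)
		b, errB := strconv.ParseInt(filepath.Base(filepath.Dir(matches[j])), 10, 64)
		if errA != nil || errB != nil {
			return false // keep unparsable entries in place
		}
		return a < b
	})
	fmt.Println(matches) // nodes/1, then nodes/2, then nodes/10
}
```
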
diff --git a/gpu/amd_windows.go b/gpu/amd_windows.go
index 425259d7..20aed447 100644
--- a/gpu/amd_windows.go
+++ b/gpu/amd_windows.go
@@ -92,7 +92,8 @@ func AMDGetGPUInfo() []RocmGPUInfo {
 			continue
 		}
 		if gfxOverride == "" {
-			if !slices.Contains[[]string, string](supported, gfx) {
+			// Strip off Target Features when comparing
+			if !slices.Contains[[]string, string](supported, strings.Split(gfx, ":")[0]) {
 				slog.Warn("amdgpu is not supported", "gpu", i, "gpu_type", gfx, "library", libDir, "supported_types", supported)
 				// TODO - consider discrete markdown just for ROCM troubleshooting?
 				slog.Warn("See https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md for HSA_OVERRIDE_GFX_VERSION usage")
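
Note: a small sketch of the target-feature stripping introduced above, using a hypothetical gfx string (not part of the patch). The runtime can report values like `gfx1030:xnack-`, which would never match the bare entries in the supported list without the split:

```go
package main

import (
	"fmt"
	"slices"
	"strings"
)

func main() {
	supported := []string{"gfx1030", "gfx1100", "gfx1101", "gfx1102"}

	// Hypothetical value reported by the HIP runtime, including target features.
	gfx := "gfx1030:xnack-"

	// Strip everything after the first ':' before checking support.
	base := strings.Split(gfx, ":")[0]

	fmt.Println(slices.Contains(supported, gfx))  // false
	fmt.Println(slices.Contains(supported, base)) // true
}
```
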
diff --git a/integration/concurrency_test.go b/integration/concurrency_test.go
index d66ba9f0..8593285b 100644
--- a/integration/concurrency_test.go
+++ b/integration/concurrency_test.go
@@ -69,7 +69,7 @@ func TestIntegrationConcurrentPredictOrcaMini(t *testing.T) {
 	reqLimit := len(req)
 	iterLimit := 5
 
-	vram := os.Getenv("OLLAMA_MAX_VRAM")
+	vram := os.Getenv("OLLAMA_MAX_VRAM") // TODO - discover actual VRAM
 	if vram != "" {
 		max, err := strconv.ParseUint(vram, 10, 64)
 		require.NoError(t, err)
@@ -106,7 +106,7 @@ func TestIntegrationConcurrentPredictOrcaMini(t *testing.T) {
 
 // Stress the system if we know how much VRAM it has, and attempt to load more models than will fit
 func TestMultiModelStress(t *testing.T) {
-	vram := os.Getenv("OLLAMA_MAX_VRAM")
+	vram := os.Getenv("OLLAMA_MAX_VRAM") // TODO - discover actual VRAM
 	if vram == "" {
 		t.Skip("OLLAMA_MAX_VRAM not specified, can't pick the right models for the stress test")
 	}
diff --git a/integration/context_test.go b/integration/context_test.go
index 46fac5ea..f1342e16 100644
--- a/integration/context_test.go
+++ b/integration/context_test.go
@@ -12,7 +12,7 @@ import (
 
 func TestContextExhaustion(t *testing.T) {
 	// Longer needed for small footprint GPUs
-	ctx, cancel := context.WithTimeout(context.Background(), 6*time.Minute)
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
 	defer cancel()
 	// Set up the test data
 	req := api.GenerateRequest{
@@ -25,5 +25,10 @@ func TestContextExhaustion(t *testing.T) {
 			"num_ctx":     128,
 		},
 	}
-	GenerateTestHelper(ctx, t, req, []string{"once", "upon", "lived"})
+	client, _, cleanup := InitServerConnection(ctx, t)
+	defer cleanup()
+	if err := PullIfMissing(ctx, client, req.Model); err != nil {
+		t.Fatalf("PullIfMissing failed: %v", err)
+	}
+	DoGenerate(ctx, t, client, req, []string{"once", "upon", "lived"}, 120*time.Second, 10*time.Second)
 }
diff --git a/integration/embed_test.go b/integration/embed_test.go
new file mode 100644
index 00000000..10333d5d
--- /dev/null
+++ b/integration/embed_test.go
@@ -0,0 +1,209 @@
+//go:build integration
+
+package integration
+
+import (
+	"context"
+	"math"
+	"testing"
+	"time"
+
+	"github.com/ollama/ollama/api"
+)
+
+func floatsEqual32(a, b float32) bool {
+	return math.Abs(float64(a-b)) <= 1e-4
+}
+
+func floatsEqual64(a, b float64) bool {
+	return math.Abs(a-b) <= 1e-4
+}
+
+func TestAllMiniLMEmbeddings(t *testing.T) {
+	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
+	defer cancel()
+
+	req := api.EmbeddingRequest{
+		Model:  "all-minilm",
+		Prompt: "why is the sky blue?",
+	}
+
+	res, err := embeddingTestHelper(ctx, t, req)
+
+	if err != nil {
+		t.Fatalf("error: %v", err)
+	}
+
+	if len(res.Embedding) != 384 {
+		t.Fatalf("expected 384 floats, got %d", len(res.Embedding))
+	}
+
+	if !floatsEqual64(res.Embedding[0], 0.06642947345972061) {
+		t.Fatalf("expected 0.06642947345972061, got %.16f", res.Embedding[0])
+	}
+}
+
+func TestAllMiniLMEmbed(t *testing.T) {
+	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
+	defer cancel()
+
+	req := api.EmbedRequest{
+		Model: "all-minilm",
+		Input: "why is the sky blue?",
+	}
+
+	res, err := embedTestHelper(ctx, t, req)
+
+	if err != nil {
+		t.Fatalf("error: %v", err)
+	}
+
+	if len(res.Embeddings) != 1 {
+		t.Fatalf("expected 1 embedding, got %d", len(res.Embeddings))
+	}
+
+	if len(res.Embeddings[0]) != 384 {
+		t.Fatalf("expected 384 floats, got %d", len(res.Embeddings[0]))
+	}
+
+	if !floatsEqual32(res.Embeddings[0][0], 0.010071031) {
+		t.Fatalf("expected 0.010071031, got %.8f", res.Embeddings[0][0])
+	}
+
+	if res.PromptEvalCount != 8 {
+		t.Fatalf("expected 8 prompt tokens, got %d", res.PromptEvalCount)
+	}
+}
+
+func TestAllMiniLMBatchEmbed(t *testing.T) {
+	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
+	defer cancel()
+
+	req := api.EmbedRequest{
+		Model: "all-minilm",
+		Input: []string{"why is the sky blue?", "why is the grass green?"},
+	}
+
+	res, err := embedTestHelper(ctx, t, req)
+
+	if err != nil {
+		t.Fatalf("error: %v", err)
+	}
+
+	if len(res.Embeddings) != 2 {
+		t.Fatalf("expected 2 embeddings, got %d", len(res.Embeddings))
+	}
+
+	if len(res.Embeddings[0]) != 384 {
+		t.Fatalf("expected 384 floats, got %d", len(res.Embeddings[0]))
+	}
+
+	if !floatsEqual32(res.Embeddings[0][0], 0.010071031) || !floatsEqual32(res.Embeddings[1][0], -0.009802706) {
+		t.Fatalf("expected 0.010071031 and -0.009802706, got %.8f and %.8f", res.Embeddings[0][0], res.Embeddings[1][0])
+	}
+
+	if res.PromptEvalCount != 16 {
+		t.Fatalf("expected 16 prompt tokens, got %d", res.PromptEvalCount)
+	}
+}
+
+func TestAllMiniLMEmbedTruncate(t *testing.T) {
+	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
+	defer cancel()
+
+	truncTrue, truncFalse := true, false
+
+	type testReq struct {
+		Name    string
+		Request api.EmbedRequest
+	}
+
+	reqs := []testReq{
+		{
+			Name: "Target Truncation",
+			Request: api.EmbedRequest{
+				Model: "all-minilm",
+				Input: "why",
+			},
+		},
+		{
+			Name: "Default Truncate",
+			Request: api.EmbedRequest{
+				Model:   "all-minilm",
+				Input:   "why is the sky blue?",
+				Options: map[string]any{"num_ctx": 1},
+			},
+		},
+		{
+			Name: "Explicit Truncate",
+			Request: api.EmbedRequest{
+				Model:    "all-minilm",
+				Input:    "why is the sky blue?",
+				Truncate: &truncTrue,
+				Options:  map[string]any{"num_ctx": 1},
+			},
+		},
+	}
+
+	res := make(map[string]*api.EmbedResponse)
+
+	for _, req := range reqs {
+		response, err := embedTestHelper(ctx, t, req.Request)
+		if err != nil {
+			t.Fatalf("error: %v", err)
+		}
+		res[req.Name] = response
+	}
+
+	if res["Target Truncation"].Embeddings[0][0] != res["Default Truncate"].Embeddings[0][0] {
+		t.Fatal("expected default request to truncate correctly")
+	}
+
+	if res["Default Truncate"].Embeddings[0][0] != res["Explicit Truncate"].Embeddings[0][0] {
+		t.Fatal("expected default request and truncate true request to be the same")
+	}
+
+	// check that truncate set to false returns an error if context length is exceeded
+	_, err := embedTestHelper(ctx, t, api.EmbedRequest{
+		Model:    "all-minilm",
+		Input:    "why is the sky blue?",
+		Truncate: &truncFalse,
+		Options:  map[string]any{"num_ctx": 1},
+	})
+
+	if err == nil {
+		t.Fatal("expected error, got nil")
+	}
+}
+
+func embeddingTestHelper(ctx context.Context, t *testing.T, req api.EmbeddingRequest) (*api.EmbeddingResponse, error) {
+	client, _, cleanup := InitServerConnection(ctx, t)
+	defer cleanup()
+	if err := PullIfMissing(ctx, client, req.Model); err != nil {
+		t.Fatalf("failed to pull model %s: %v", req.Model, err)
+	}
+
+	response, err := client.Embeddings(ctx, &req)
+
+	if err != nil {
+		return nil, err
+	}
+
+	return response, nil
+}
+
+func embedTestHelper(ctx context.Context, t *testing.T, req api.EmbedRequest) (*api.EmbedResponse, error) {
+	client, _, cleanup := InitServerConnection(ctx, t)
+	defer cleanup()
+	if err := PullIfMissing(ctx, client, req.Model); err != nil {
+		t.Fatalf("failed to pull model %s: %v", req.Model, err)
+	}
+
+	response, err := client.Embed(ctx, &req)
+
+	if err != nil {
+		return nil, err
+	}
+
+	return response, nil
+}
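
Note: the new tests above exercise `client.Embed` and `api.EmbedRequest`, which accept either a single string or a batch of strings. A short usage sketch against a locally running server, assuming the `api` package changes that accompany this diff (model name is just an example):

```go
package main

import (
	"context"
	"fmt"
	"log"

	"github.com/ollama/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}

	// Input may be a single string or, as here, a batch of strings.
	resp, err := client.Embed(context.Background(), &api.EmbedRequest{
		Model: "all-minilm",
		Input: []string{"why is the sky blue?", "why is the grass green?"},
	})
	if err != nil {
		log.Fatal(err)
	}

	// One embedding per input, plus the total prompt token count.
	fmt.Println(len(resp.Embeddings), resp.PromptEvalCount)
}
```
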
diff --git a/llm/ext_server/server.cpp b/llm/ext_server/server.cpp
index 0ef3956e..d72bb1b1 100644
--- a/llm/ext_server/server.cpp
+++ b/llm/ext_server/server.cpp
@@ -41,6 +41,7 @@
 
 #if defined(_WIN32)
 #include <windows.h>
+#include <errhandlingapi.h>
 #endif
 
 #include 
@@ -1220,6 +1221,7 @@ struct llama_server_context
                 res.result_json = json
                 {
                     {"embedding", std::vector<float>(embd, embd + n_embd)},
+                    {"timings",             slot.get_formated_timings()},
                 };
             }
         }
@@ -2437,15 +2439,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
             params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
             params.use_mmap = false;
         }
-        else if (arg == "--lora-base")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            params.lora_base = argv[i];
-        }
         else if (arg == "-v" || arg == "--verbose")
         {
             server_verbose = true;
@@ -2737,6 +2730,9 @@ int wmain(int argc, wchar_t **wargv) {
     for (int i = 0; i < argc; ++i) {
         argv[i] = wchar_to_char(wargv[i]);
     }
+
+    // Adjust error mode to avoid error dialog after we start.
+    SetErrorMode(SEM_FAILCRITICALERRORS);
 #else
 int main(int argc, char **argv) {
 #endif
@@ -3188,26 +3184,37 @@ int main(int argc, char **argv) {
                     prompt = "";
                 }
 
-                json image_data;
-                if (body.count("image_data") != 0) {
-                    image_data = body["image_data"];
-                }
-                else
-                {
-                    image_data = "";
+                if (prompt.size() == 1) {
+                    prompt = prompt[0];
                 }
 
                 // create and queue the task
-                const int task_id = llama.queue_tasks.get_new_id();
-                llama.queue_results.add_waiting_task_id(task_id);
-                llama.request_completion(task_id, { {"prompt", prompt}, { "n_predict", 0}, {"image_data", image_data} }, true, -1);
+                json responses;
+                {
+                    const int id_task = llama.queue_tasks.get_new_id();
+                    llama.queue_results.add_waiting_task_id(id_task);
+                    llama.request_completion(id_task, {{"prompt", prompt}}, true, -1);
 
-                // get the result
-                task_result result = llama.queue_results.recv(task_id);
-                llama.queue_results.remove_waiting_task_id(task_id);
+                    // get the result
+                    task_result result = llama.queue_results.recv(id_task);
+                    llama.queue_results.remove_waiting_task_id(id_task);
+                    if (result.error) {
+                        return res.set_content(result.result_json.dump(), "application/json; charset=utf-8");
+                    }
 
-                // send the result
-                return res.set_content(result.result_json.dump(), "application/json; charset=utf-8");
+                    responses = result.result_json.value("results", std::vector<json>{result.result_json});
+                    json embeddings = json::array();
+
+                    int prompt_n = 0;
+                    for (auto & elem : responses) {
+                        embeddings.push_back(elem.at("embedding"));
+                        prompt_n += elem.at("timings").at("prompt_n").get<int>();
+                    }
+
+                    // send the result
+                    json embedding_res = json{{"embedding", embeddings}, {"prompt_n", prompt_n}};
+                    return res.set_content(embedding_res.dump(), "application/json; charset=utf-8");
+                }
             });
 
     // GG: if I put the main loop inside a thread, it crashes on the first request when build in Debug!?
diff --git a/llm/generate/gen_windows.ps1 b/llm/generate/gen_windows.ps1
index beb964f9..d8bce92d 100644
--- a/llm/generate/gen_windows.ps1
+++ b/llm/generate/gen_windows.ps1
@@ -7,8 +7,8 @@ function amdGPUs {
         return $env:AMDGPU_TARGETS
     }
     # Current supported rocblas list from ROCm v6.1.2 on windows
+    # https://rocm.docs.amd.com/projects/install-on-windows/en/latest/reference/system-requirements.html#windows-supported-gpus
     $GPU_LIST = @(
-        "gfx906:xnack-"
         "gfx1030"
         "gfx1100"
         "gfx1101"
diff --git a/llm/gguf.go b/llm/gguf.go
index 4d343a1b..a8427aed 100644
--- a/llm/gguf.go
+++ b/llm/gguf.go
@@ -537,6 +537,7 @@ var ggufKVOrder = map[string][]string{
 		"tokenizer.ggml.add_bos_token",
 		"tokenizer.ggml.add_eos_token",
 		"tokenizer.chat_template",
+		"bert.pooling_type",
 	},
 }
 
diff --git a/llm/llama.cpp b/llm/llama.cpp
index a8db2a9c..6eeaeba1 160000
--- a/llm/llama.cpp
+++ b/llm/llama.cpp
@@ -1 +1 @@
-Subproject commit a8db2a9ce64cd4417f6a312ab61858f17f0f8584
+Subproject commit 6eeaeba126ff701f3e8f79f246805b7023709972
diff --git a/llm/llm_darwin_amd64.go b/llm/llm_darwin_amd64.go
index 3093e1ad..60eed719 100644
--- a/llm/llm_darwin_amd64.go
+++ b/llm/llm_darwin_amd64.go
@@ -2,7 +2,10 @@ package llm
 
 import (
 	"embed"
+	"syscall"
 )
 
 //go:embed build/darwin/x86_64/*/bin/*
 var libEmbed embed.FS
+
+var LlamaServerSysProcAttr = &syscall.SysProcAttr{}
diff --git a/llm/llm_darwin_arm64.go b/llm/llm_darwin_arm64.go
index 928f0b82..20ce8552 100644
--- a/llm/llm_darwin_arm64.go
+++ b/llm/llm_darwin_arm64.go
@@ -2,7 +2,10 @@ package llm
 
 import (
 	"embed"
+	"syscall"
 )
 
 //go:embed build/darwin/arm64/*/bin/*
 var libEmbed embed.FS
+
+var LlamaServerSysProcAttr = &syscall.SysProcAttr{}
diff --git a/llm/llm_linux.go b/llm/llm_linux.go
index c2c5c4cb..928b4e79 100644
--- a/llm/llm_linux.go
+++ b/llm/llm_linux.go
@@ -1,6 +1,11 @@
 package llm
 
-import "embed"
+import (
+	"embed"
+	"syscall"
+)
 
 //go:embed build/linux/*/*/bin/*
 var libEmbed embed.FS
+
+var LlamaServerSysProcAttr = &syscall.SysProcAttr{}
diff --git a/llm/llm_windows.go b/llm/llm_windows.go
index e44f4b95..763cccf9 100644
--- a/llm/llm_windows.go
+++ b/llm/llm_windows.go
@@ -1,6 +1,20 @@
 package llm
 
-import "embed"
+import (
+	"embed"
+	"syscall"
+)
 
 // unused on windows
 var libEmbed embed.FS
+
+const CREATE_DEFAULT_ERROR_MODE = 0x04000000
+
+var LlamaServerSysProcAttr = &syscall.SysProcAttr{
+	// Wire up the default error handling logic. If for some reason a DLL is
+	// missing in the path this will pop up a GUI Dialog explaining the fault so
+	// the user can either fix their PATH, or report a bug. Without this
+	// setting, the process exits immediately with a generic exit status but no
+	// way to (easily) figure out what the actual missing DLL was.
+	CreationFlags: CREATE_DEFAULT_ERROR_MODE,
+}
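
Note: a minimal sketch of how this flag is consumed when launching the runner subprocess (the executable path is hypothetical, not part of the patch). The point is that `CreationFlags` restores the default Windows error mode so a missing DLL produces a dialog naming the DLL rather than a silent exit:

```go
//go:build windows

package main

import (
	"log"
	"os/exec"
	"syscall"
)

const CREATE_DEFAULT_ERROR_MODE = 0x04000000

func main() {
	// Hypothetical runner path; in Ollama the flag is applied via LlamaServerSysProcAttr.
	cmd := exec.Command(`C:\path\to\ollama_llama_server.exe`)
	cmd.SysProcAttr = &syscall.SysProcAttr{CreationFlags: CREATE_DEFAULT_ERROR_MODE}
	if err := cmd.Start(); err != nil {
		log.Fatal(err)
	}
	_ = cmd.Wait()
}
```
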
diff --git a/llm/patches/05-default-pretokenizer.diff b/llm/patches/05-default-pretokenizer.diff
index 341a6f59..0d40fc3c 100644
--- a/llm/patches/05-default-pretokenizer.diff
+++ b/llm/patches/05-default-pretokenizer.diff
@@ -1,8 +1,8 @@
 diff --git a/src/llama.cpp b/src/llama.cpp
-index 2b9ace28..172640e2 100644
+index a207451f..2ddf431d 100644
 --- a/src/llama.cpp
 +++ b/src/llama.cpp
-@@ -5357,16 +5357,7 @@ static void llm_load_vocab(
+@@ -5347,16 +5347,7 @@ static void llm_load_vocab(
          if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
              vocab.tokenizer_add_space_prefix = false;
              vocab.tokenizer_clean_spaces = true;
@@ -20,9 +20,9 @@ index 2b9ace28..172640e2 100644
                  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
              } else if (
                      tokenizer_pre == "llama3"   ||
-@@ -5439,7 +5430,8 @@ static void llm_load_vocab(
-                 tokenizer_pre == "jais") {
-                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS;
+@@ -5443,7 +5434,8 @@ static void llm_load_vocab(
+                 tokenizer_pre == "codeshell") {
+                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CODESHELL;
              } else {
 -                throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
 +                LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
diff --git a/llm/patches/07-embeddings.diff b/llm/patches/06-embeddings.diff
similarity index 100%
rename from llm/patches/07-embeddings.diff
rename to llm/patches/06-embeddings.diff
diff --git a/llm/patches/06-qwen2.diff b/llm/patches/06-qwen2.diff
deleted file mode 100644
index 1c7109f6..00000000
--- a/llm/patches/06-qwen2.diff
+++ /dev/null
@@ -1,13 +0,0 @@
-diff --git a/src/llama.cpp b/src/llama.cpp
-index 40d2ec2c..f34eb79a 100644
---- a/src/llama.cpp
-+++ b/src/llama.cpp
-@@ -6943,7 +6943,7 @@ static struct ggml_tensor * llm_build_kqv(
-         struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
-         cb(kq, "kq", il);
- 
--        if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) {
-+        if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX || model.arch == LLM_ARCH_QWEN2) {
-             // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
-             // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
-             ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
diff --git a/llm/patches/08-clip-unicode.diff b/llm/patches/07-clip-unicode.diff
similarity index 100%
rename from llm/patches/08-clip-unicode.diff
rename to llm/patches/07-clip-unicode.diff
diff --git a/llm/patches/09-pooling.diff b/llm/patches/08-pooling.diff
similarity index 100%
rename from llm/patches/09-pooling.diff
rename to llm/patches/08-pooling.diff
diff --git a/llm/patches/09-lora.diff b/llm/patches/09-lora.diff
new file mode 100644
index 00000000..10c66d1d
--- /dev/null
+++ b/llm/patches/09-lora.diff
@@ -0,0 +1,358 @@
+diff --git a/common/common.cpp b/common/common.cpp
+index dbb724fb..c26fe6ee 100644
+--- a/common/common.cpp
++++ b/common/common.cpp
+@@ -2087,14 +2087,27 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
+     for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
+         const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
+         float lora_scale = std::get<1>(params.lora_adapter[i]);
++
++        // try to load as gguf
+         auto adapter = llama_lora_adapter_init(model, lora_adapter.c_str());
+         if (adapter == nullptr) {
+-            fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
+-            llama_free(lctx);
+-            llama_free_model(model);
+-            return std::make_tuple(nullptr, nullptr);
++            fprintf(stderr, "%s: error: failed to apply lora adapter, trying ggla\n", __func__);
++
++            // if that fails, try loading as ggla for compatibility
++            int err = llama_model_apply_lora_from_file(model,
++                                                    lora_adapter.c_str(),
++                                                    lora_scale,
++                                                    nullptr,
++                                                    params.n_threads);
++            if (err != 0) {
++                fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
++                llama_free(lctx);
++                llama_free_model(model);
++                return std::make_tuple(nullptr, nullptr);
++            }
++        } else {
++            llama_lora_adapter_set(lctx, adapter, lora_scale);
+         }
+-        llama_lora_adapter_set(lctx, adapter, lora_scale);
+     }
+ 
+     if (params.ignore_eos) {
+diff --git a/include/llama.h b/include/llama.h
+index 93fd77ca..b0fb37a6 100644
+--- a/include/llama.h
++++ b/include/llama.h
+@@ -1160,6 +1160,20 @@ extern "C" {
+ 
+     LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx);
+ 
++    // Apply a LoRA adapter to a loaded model
++    // path_base_model is the path to a higher quality model to use as a base for
++    // the layers modified by the adapter. Can be NULL to use the current loaded model.
++    // The model needs to be reloaded before applying a new adapter, otherwise the adapter
++    // will be applied on top of the previous one
++    // Returns 0 on success
++    LLAMA_API int32_t llama_model_apply_lora_from_file(
++            const struct llama_model * model,
++                            const char * path_lora,
++                                float   scale,
++                            const char * path_base_model,
++                                int32_t   n_threads);
++
++
+ #ifdef __cplusplus
+ }
+ #endif
+diff --git a/src/llama.cpp b/src/llama.cpp
+index 80a0dd0f..9d7b0e17 100644
+--- a/src/llama.cpp
++++ b/src/llama.cpp
+@@ -21880,3 +21880,290 @@ static void llama_log_callback_default(ggml_log_level level, const char * text,
+     fputs(text, stderr);
+     fflush(stderr);
+ }
++
++static int llama_apply_lora_from_file_internal(
++    const struct llama_model & model, const char * path_lora, float scale, const char * path_base_model, int n_threads
++) {
++    LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
++
++    const int64_t t_start_lora_us = ggml_time_us();
++
++    llama_file fin(path_lora, "rb");
++
++    // verify magic and version
++    {
++        uint32_t magic = fin.read_u32();
++        if (magic != LLAMA_FILE_MAGIC_GGLA) {
++            LLAMA_LOG_ERROR("%s: bad file magic\n", __func__);
++            return 1;
++        }
++
++        uint32_t format_version = fin.read_u32();
++        if (format_version != 1) {
++            LLAMA_LOG_ERROR("%s: unsupported file version\n", __func__ );
++            return 1;
++        }
++    }
++
++    int32_t lora_r = fin.read_u32();
++    int32_t lora_alpha = fin.read_u32();
++    float scaling = scale * (float)lora_alpha / (float)lora_r;
++
++    LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
++
++    // load base model
++    std::unique_ptr<llama_model_loader> ml;
++    if (path_base_model) {
++        LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
++        ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*check_tensors*/ false, /*kv_overrides*/ nullptr));
++        ml->init_mappings(/*prefetch*/ false); // no prefetching
++    }
++
++    struct tensor_meta {
++        std::string name;
++        ggml_type type;
++        int32_t ne[2];
++        size_t offset;
++    };
++    std::map<std::string, tensor_meta> tensor_meta_map;
++
++    // load all tensor meta
++    while (true) {
++        if (fin.tell() == fin.size) {
++            // eof
++            break;
++        }
++
++        int32_t n_dims;
++        int32_t name_len;
++        int32_t ftype;
++
++        fin.read_raw(&n_dims, sizeof(n_dims));
++        fin.read_raw(&name_len, sizeof(name_len));
++        fin.read_raw(&ftype, sizeof(ftype));
++
++        if (n_dims != 1 && n_dims != 2) {
++            LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims);
++            return 1;
++        }
++
++        int32_t ne[2] = { 1, 1 };
++        for (int i = 0; i < n_dims; ++i) {
++            fin.read_raw(&ne[i], sizeof(ne[i]));
++        }
++
++        std::string name;
++        {
++            GGML_ASSERT(name_len < GGML_MAX_NAME);
++            char buf[GGML_MAX_NAME];
++            fin.read_raw(buf, name_len);
++            name = std::string(buf, name_len);
++        }
++
++        // check for lora suffix
++        std::string lora_suffix;
++        if (name.length() > 6) {
++            lora_suffix = name.substr(name.length() - 6);
++        }
++        if (lora_suffix != ".loraA" && lora_suffix != ".loraB") {
++            LLAMA_LOG_ERROR("%s: error: '%s' is not a lora tensor\n", __func__, name.c_str());
++            return 1;
++        }
++
++        // tensor type
++        ggml_type wtype;
++        switch (ftype) {
++            case 0: wtype = GGML_TYPE_F32;  break;
++            case 1: wtype = GGML_TYPE_F16;  break;
++            default:
++                    {
++                        LLAMA_LOG_ERROR("%s: invalid tensor data type '%d'\n",
++                                __func__, ftype);
++                        return 1;
++                    }
++        }
++
++        // data offset
++        size_t offset = fin.tell();
++        offset = (offset + 31) & -32;
++
++        // skip tensor data
++        fin.seek(offset + ggml_row_size(wtype, ne[0]) * ne[1], SEEK_SET);
++
++        tensor_meta_map.emplace(name, tensor_meta{ name, wtype, { ne[0], ne[1] }, offset });
++    }
++
++    bool warned = false;
++    int n_tensors = 0;
++
++    // apply
++    ggml_backend_t backend_cpu = ggml_backend_cpu_init();
++    if (backend_cpu == nullptr) {
++        LLAMA_LOG_ERROR("%s: error: failed to initialize cpu backend\n", __func__);
++        return 1;
++    }
++    ggml_backend_cpu_set_n_threads(backend_cpu, n_threads);
++
++    std::vector<no_init<uint8_t>> read_buf;
++    for (const auto & it : model.tensors_by_name) {
++        const std::string & base_name = it.first;
++        ggml_tensor * model_t = it.second;
++
++        if (tensor_meta_map.find(base_name + ".loraA") == tensor_meta_map.end() ||
++            tensor_meta_map.find(base_name + ".loraB") == tensor_meta_map.end()) {
++            continue;
++        }
++
++        tensor_meta & metaA = tensor_meta_map.at(base_name + ".loraA");
++        tensor_meta & metaB = tensor_meta_map.at(base_name + ".loraB");
++
++        ggml_init_params lora_init_params = {
++            /* .mem_size   */ ggml_tensor_overhead()*128 + ggml_graph_overhead(),
++            /* .mem_buffer */ nullptr,
++            /* .no_alloc   */ true,
++        };
++        ggml_context * lora_ctx = ggml_init(lora_init_params);
++        if (lora_ctx == nullptr) {
++            LLAMA_LOG_ERROR("%s: error: failed to initialize lora context\n", __func__);
++            ggml_backend_free(backend_cpu);
++            return 1;
++        }
++
++        // create tensors
++        ggml_tensor * loraA = ggml_new_tensor_2d(lora_ctx, metaA.type, metaA.ne[0], metaA.ne[1]);
++        ggml_tensor * loraB = ggml_new_tensor_2d(lora_ctx, metaB.type, metaB.ne[0], metaB.ne[1]);
++        ggml_set_name(loraA, metaA.name.c_str());
++        ggml_set_name(loraB, metaB.name.c_str());
++
++        ggml_tensor * base_t;
++        if (ml) {
++            if (!ml->get_tensor_meta(base_name.c_str())) {
++                LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
++                return 1;
++            }
++            base_t = ggml_dup_tensor(lora_ctx, ml->get_tensor_meta(base_name.c_str()));
++        } else {
++            base_t = ggml_dup_tensor(lora_ctx, model_t);
++        }
++        ggml_set_name(base_t, base_name.c_str());
++
++        // allocate in backend buffer
++        ggml_backend_buffer_t lora_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type());
++        if (lora_buf == nullptr) {
++            LLAMA_LOG_ERROR("%s: error: failed to allocate lora tensors\n", __func__);
++            return 1;
++        }
++
++        // load tensor data
++        auto load_tensor = [&read_buf, &fin](const tensor_meta & tensor_meta, ggml_tensor * tensor) {
++            read_buf.resize(ggml_nbytes(tensor));
++            fin.seek(tensor_meta.offset, SEEK_SET);
++            fin.read_raw(read_buf.data(), ggml_nbytes(tensor));
++            ggml_backend_tensor_set(tensor, read_buf.data(), 0, read_buf.size());
++        };
++        load_tensor(metaA, loraA);
++        load_tensor(metaB, loraB);
++
++        // load base model tensor data
++        if (ml) {
++            ml->load_data_for(base_t);
++        } else {
++            ggml_backend_tensor_copy(model_t, base_t);
++        }
++
++        if (ggml_is_quantized(base_t->type) && !warned) {
++            LLAMA_LOG_WARN("%s: warning: using a lora adapter with a quantized model may result in poor quality, "
++                            "use a f16 or f32 base model with --lora-base\n", __func__);
++            warned = true;
++        }
++
++        if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
++            LLAMA_LOG_ERROR("%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
++                            " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]);
++            ggml_free(lora_ctx);
++            ggml_backend_buffer_free(lora_buf);
++            ggml_backend_free(backend_cpu);
++            return 1;
++        }
++
++        auto build_lora_graph = [&]() {
++            // w = w + BA*s
++            ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
++            ggml_set_name(BA, "BA");
++
++            if (scaling != 1.0f) {
++                BA = ggml_scale(lora_ctx, BA, scaling);
++                ggml_set_name(BA, "BA_scaled");
++            }
++
++            ggml_tensor * r;
++            r = ggml_add_inplace(lora_ctx, base_t, BA);
++            ggml_set_name(r, "r_add");
++
++            if (base_t->type != model_t->type) {
++                // convert the result to the model type
++                r = ggml_cast(lora_ctx, r, model_t->type);
++                ggml_set_name(r, "r_cast");
++            }
++
++            return r;
++        };
++
++        ggml_cgraph * gf = ggml_new_graph(lora_ctx);
++        ggml_tensor * r = build_lora_graph();
++        ggml_build_forward_expand(gf, r);
++
++        ggml_backend_buffer_t graph_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type());
++        if (graph_buf == nullptr) {
++            LLAMA_LOG_ERROR("%s: error: failed to allocate graph tensors\n", __func__);
++            ggml_free(lora_ctx);
++            ggml_backend_buffer_free(lora_buf);
++            ggml_backend_free(backend_cpu);
++            return 1;
++        }
++
++        ggml_backend_graph_compute(backend_cpu, gf);
++
++        ggml_backend_tensor_set(model_t, r->data, 0, ggml_nbytes(r));
++
++#if 0
++        // TODO: use scheduler with fallback to CPU for less copies between CPU and GPU
++        //ggml_backend_sched_t sched = ggml_backend_sched_new(backends.data(), backends.size(), GGML_DEFAULT_GRAPH_SIZE);
++
++        // sched compute
++        ggml_build_forward_expand(gf, build_graph());
++        ggml_backend_sched_init_measure(sched, gf);
++
++        // create the graph again, since the previous one was destroyed by the measure
++        ggml_graph_clear(gf);
++        ggml_build_forward_expand(gf, build_graph());
++        ggml_backend_sched_graph_compute(sched, gf);
++        ggml_backend_sched_free(sched);
++#endif
++
++        ggml_backend_buffer_free(lora_buf);
++        ggml_backend_buffer_free(graph_buf);
++        ggml_free(lora_ctx);
++
++        n_tensors++;
++        if (n_tensors % 4 == 0) {
++            LLAMA_LOG_INFO(".");
++        }
++    }
++
++    ggml_backend_free(backend_cpu);
++
++    const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
++    LLAMA_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0);
++
++    return 0;
++}
++
++int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) {
++    try {
++        return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads);
++    } catch (const std::exception & err) {
++        LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
++        return 1;
++    }
++}
+\ No newline at end of file
diff --git a/llm/server.go b/llm/server.go
index 8f37aa23..afde077e 100644
--- a/llm/server.go
+++ b/llm/server.go
@@ -33,7 +33,7 @@ type LlamaServer interface {
 	Ping(ctx context.Context) error
 	WaitUntilRunning(ctx context.Context) error
 	Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error
-	Embedding(ctx context.Context, prompt string) ([]float64, error)
+	Embed(ctx context.Context, input []string) (*EmbedResponse, error)
 	Tokenize(ctx context.Context, content string) ([]int, error)
 	Detokenize(ctx context.Context, tokens []int) (string, error)
 	Close() error
@@ -127,7 +127,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 	// On linux, over-allocating CPU memory will almost always result in an error
 	if runtime.GOOS == "linux" {
 		systemMemoryRequired := estimate.TotalSize - estimate.VRAMSize
-		available := min(systemTotalMemory, systemFreeMemory+systemSwapFreeMemory)
+		available := systemFreeMemory + systemSwapFreeMemory
 		if systemMemoryRequired > available {
 			slog.Warn("model request too large for system", "requested", format.HumanBytes2(systemMemoryRequired), "available", available, "total", format.HumanBytes2(systemTotalMemory), "free", format.HumanBytes2(systemFreeMemory), "swap", format.HumanBytes2(systemSwapFreeMemory))
 			return nil, fmt.Errorf("model requires more system memory (%s) than is available (%s)", format.HumanBytes2(systemMemoryRequired), format.HumanBytes2(available))
@@ -346,6 +346,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		s.cmd.Env = os.Environ()
 		s.cmd.Stdout = os.Stdout
 		s.cmd.Stderr = s.status
+		s.cmd.SysProcAttr = LlamaServerSysProcAttr
 
 		envWorkarounds := [][2]string{}
 		for _, gpu := range gpus {
@@ -385,8 +386,10 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 			filteredEnv := []string{}
 			for _, ev := range s.cmd.Env {
 				if strings.HasPrefix(ev, "CUDA_") ||
+					strings.HasPrefix(ev, "ROCR_") ||
 					strings.HasPrefix(ev, "ROCM_") ||
 					strings.HasPrefix(ev, "HIP_") ||
+					strings.HasPrefix(ev, "GPU_") ||
 					strings.HasPrefix(ev, "HSA_") ||
 					strings.HasPrefix(ev, "GGML_") ||
 					strings.HasPrefix(ev, "PATH=") ||
@@ -415,7 +418,17 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 
 		// reap subprocess when it exits
 		go func() {
-			s.done <- s.cmd.Wait()
+			err := s.cmd.Wait()
+			// Favor a more detailed message over the process exit status
+			if err != nil && s.status != nil && s.status.LastErrMsg != "" {
+				slog.Debug("llama runner terminated", "error", err)
+				if strings.Contains(s.status.LastErrMsg, "unknown model") {
+					s.status.LastErrMsg = "this model is not supported by your version of Ollama. You may need to upgrade"
+				}
+				s.done <- fmt.Errorf(s.status.LastErrMsg)
+			} else {
+				s.done <- err
+			}
 		}()
 
 		return s, nil
@@ -578,14 +591,7 @@ func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
 			slog.Warn("client connection closed before server finished loading, aborting load")
 			return fmt.Errorf("timed out waiting for llama runner to start: %w", ctx.Err())
 		case err := <-s.done:
-			msg := ""
-			if s.status != nil && s.status.LastErrMsg != "" {
-				msg = s.status.LastErrMsg
-			}
-			if strings.Contains(msg, "unknown model") {
-				return fmt.Errorf("this model is not supported by your version of Ollama. You may need to upgrade")
-			}
-			return fmt.Errorf("llama runner process has terminated: %v %s", err, msg)
+			return fmt.Errorf("llama runner process has terminated: %w", err)
 		default:
 		}
 		if time.Now().After(stallTimer) {
@@ -721,6 +727,7 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
 		"temperature":       req.Options.Temperature,
 		"top_k":             req.Options.TopK,
 		"top_p":             req.Options.TopP,
+		"min_p":             req.Options.MinP,
 		"tfs_z":             req.Options.TFSZ,
 		"typical_p":         req.Options.TypicalP,
 		"repeat_last_n":     req.Options.RepeatLastN,
@@ -867,15 +874,16 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
 	return nil
 }
 
-type EmbeddingRequest struct {
-	Content string `json:"content"`
+type EmbedRequest struct {
+	Content []string `json:"content"`
 }
 
-type EmbeddingResponse struct {
-	Embedding []float64 `json:"embedding"`
+type EmbedResponse struct {
+	Embedding       [][]float32 `json:"embedding"`
+	PromptEvalCount int         `json:"prompt_n"`
 }
 
-func (s *llmServer) Embedding(ctx context.Context, prompt string) ([]float64, error) {
+func (s *llmServer) Embed(ctx context.Context, input []string) (*EmbedResponse, error) {
 	if err := s.sem.Acquire(ctx, 1); err != nil {
 		slog.Error("Failed to acquire semaphore", "error", err)
 		return nil, err
@@ -890,7 +898,7 @@ func (s *llmServer) Embedding(ctx context.Context, prompt string) ([]float64, er
 		return nil, fmt.Errorf("unexpected server status: %s", status.ToString())
 	}
 
-	data, err := json.Marshal(TokenizeRequest{Content: prompt})
+	data, err := json.Marshal(EmbedRequest{Content: input})
 	if err != nil {
 		return nil, fmt.Errorf("error marshaling embed data: %w", err)
 	}
@@ -917,12 +925,12 @@ func (s *llmServer) Embedding(ctx context.Context, prompt string) ([]float64, er
 		return nil, fmt.Errorf("%s", body)
 	}
 
-	var embedding EmbeddingResponse
-	if err := json.Unmarshal(body, &embedding); err != nil {
+	var e EmbedResponse
+	if err := json.Unmarshal(body, &e); err != nil {
 		return nil, fmt.Errorf("unmarshal tokenize response: %w", err)
 	}
 
-	return embedding.Embedding, nil
+	return &e, nil
 }
 
 type TokenizeRequest struct {
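
Note: a sketch of the request/response wire format the runner's embedding endpoint now uses, mirroring the `EmbedRequest`/`EmbedResponse` types above (example values are illustrative, not from a real run; not part of the patch):

```go
package main

import (
	"encoding/json"
	"fmt"
)

// Mirrors llm.EmbedRequest / llm.EmbedResponse from this change.
type embedRequest struct {
	Content []string `json:"content"`
}

type embedResponse struct {
	Embedding       [][]float32 `json:"embedding"`
	PromptEvalCount int         `json:"prompt_n"`
}

func main() {
	body, _ := json.Marshal(embedRequest{Content: []string{"why is the sky blue?", "why is the grass green?"}})
	fmt.Println(string(body)) // {"content":["why is the sky blue?","why is the grass green?"]}

	var resp embedResponse
	_ = json.Unmarshal([]byte(`{"embedding":[[0.0101],[-0.0098]],"prompt_n":16}`), &resp)
	fmt.Println(len(resp.Embedding), resp.PromptEvalCount) // 2 16
}
```
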
diff --git a/macapp/src/app.tsx b/macapp/src/app.tsx
index ab17df60..a627e63d 100644
--- a/macapp/src/app.tsx
+++ b/macapp/src/app.tsx
@@ -19,7 +19,7 @@ export default function () {
   const [step, setStep] = useState(Step.WELCOME)
   const [commandCopied, setCommandCopied] = useState(false)
 
-  const command = 'ollama run llama3'
+  const command = 'ollama run llama3.1'
 
   return (
     
diff --git a/openai/openai.go b/openai/openai.go index 1707da14..5bd80660 100644 --- a/openai/openai.go +++ b/openai/openai.go @@ -3,11 +3,14 @@ package openai import ( "bytes" + "encoding/base64" "encoding/json" "fmt" "io" + "log/slog" "math/rand" "net/http" + "strings" "time" "github.com/gin-gonic/gin" @@ -27,8 +30,9 @@ type ErrorResponse struct { } type Message struct { - Role string `json:"role"` - Content string `json:"content"` + Role string `json:"role"` + Content any `json:"content"` + ToolCalls []ToolCall `json:"tool_calls,omitempty"` } type Choice struct { @@ -59,6 +63,11 @@ type ResponseFormat struct { Type string `json:"type"` } +type EmbedRequest struct { + Input any `json:"input"` + Model string `json:"model"` +} + type ChatCompletionRequest struct { Model string `json:"model"` Messages []Message `json:"messages"` @@ -71,6 +80,7 @@ type ChatCompletionRequest struct { PresencePenalty *float64 `json:"presence_penalty_penalty"` TopP *float64 `json:"top_p"` ResponseFormat *ResponseFormat `json:"response_format"` + Tools []api.Tool `json:"tools"` } type ChatCompletion struct { @@ -104,6 +114,7 @@ type CompletionRequest struct { Stream bool `json:"stream"` Temperature *float32 `json:"temperature"` TopP float32 `json:"top_p"` + Suffix string `json:"suffix"` } type Completion struct { @@ -125,6 +136,15 @@ type CompletionChunk struct { SystemFingerprint string `json:"system_fingerprint"` } +type ToolCall struct { + ID string `json:"id"` + Type string `json:"type"` + Function struct { + Name string `json:"name"` + Arguments string `json:"arguments"` + } `json:"function"` +} + type Model struct { Id string `json:"id"` Object string `json:"object"` @@ -132,11 +152,23 @@ type Model struct { OwnedBy string `json:"owned_by"` } +type Embedding struct { + Object string `json:"object"` + Embedding []float32 `json:"embedding"` + Index int `json:"index"` +} + type ListCompletion struct { Object string `json:"object"` Data []Model `json:"data"` } +type EmbeddingList struct { + Object string `json:"object"` + Data []Embedding `json:"data"` + Model string `json:"model"` +} + func NewError(code int, message string) ErrorResponse { var etype string switch code { @@ -151,7 +183,31 @@ func NewError(code int, message string) ErrorResponse { return ErrorResponse{Error{Type: etype, Message: message}} } +func toolCallId() string { + const letterBytes = "abcdefghijklmnopqrstuvwxyz0123456789" + b := make([]byte, 8) + for i := range b { + b[i] = letterBytes[rand.Intn(len(letterBytes))] + } + return "call_" + strings.ToLower(string(b)) +} + func toChatCompletion(id string, r api.ChatResponse) ChatCompletion { + toolCalls := make([]ToolCall, len(r.Message.ToolCalls)) + for i, tc := range r.Message.ToolCalls { + toolCalls[i].ID = toolCallId() + toolCalls[i].Type = "function" + toolCalls[i].Function.Name = tc.Function.Name + + args, err := json.Marshal(tc.Function.Arguments) + if err != nil { + slog.Error("could not marshall function arguments to json", "error", err) + continue + } + + toolCalls[i].Function.Arguments = string(args) + } + return ChatCompletion{ Id: id, Object: "chat.completion", @@ -160,8 +216,11 @@ func toChatCompletion(id string, r api.ChatResponse) ChatCompletion { SystemFingerprint: "fp_ollama", Choices: []Choice{{ Index: 0, - Message: Message{Role: r.Message.Role, Content: r.Message.Content}, + Message: Message{Role: r.Message.Role, Content: r.Message.Content, ToolCalls: toolCalls}, FinishReason: func(reason string) *string { + if len(toolCalls) > 0 { + reason = "tool_calls" + } if 
len(reason) > 0 { return &reason } @@ -169,7 +228,6 @@ func toChatCompletion(id string, r api.ChatResponse) ChatCompletion { }(r.DoneReason), }}, Usage: Usage{ - // TODO: ollama returns 0 for prompt eval if the prompt was cached, but openai returns the actual count PromptTokens: r.PromptEvalCount, CompletionTokens: r.EvalCount, TotalTokens: r.PromptEvalCount + r.EvalCount, @@ -215,7 +273,6 @@ func toCompletion(id string, r api.GenerateResponse) Completion { }(r.DoneReason), }}, Usage: Usage{ - // TODO: ollama returns 0 for prompt eval if the prompt was cached, but openai returns the actual count PromptTokens: r.PromptEvalCount, CompletionTokens: r.EvalCount, TotalTokens: r.PromptEvalCount + r.EvalCount, @@ -260,6 +317,27 @@ func toListCompletion(r api.ListResponse) ListCompletion { } } +func toEmbeddingList(model string, r api.EmbedResponse) EmbeddingList { + if r.Embeddings != nil { + var data []Embedding + for i, e := range r.Embeddings { + data = append(data, Embedding{ + Object: "embedding", + Embedding: e, + Index: i, + }) + } + + return EmbeddingList{ + Object: "list", + Data: data, + Model: model, + } + } + + return EmbeddingList{} +} + func toModel(r api.ShowResponse, m string) Model { return Model{ Id: m, @@ -269,10 +347,77 @@ func toModel(r api.ShowResponse, m string) Model { } } -func fromChatRequest(r ChatCompletionRequest) api.ChatRequest { +func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) { var messages []api.Message for _, msg := range r.Messages { - messages = append(messages, api.Message{Role: msg.Role, Content: msg.Content}) + switch content := msg.Content.(type) { + case string: + messages = append(messages, api.Message{Role: msg.Role, Content: content}) + case []any: + for _, c := range content { + data, ok := c.(map[string]any) + if !ok { + return nil, fmt.Errorf("invalid message format") + } + switch data["type"] { + case "text": + text, ok := data["text"].(string) + if !ok { + return nil, fmt.Errorf("invalid message format") + } + messages = append(messages, api.Message{Role: msg.Role, Content: text}) + case "image_url": + var url string + if urlMap, ok := data["image_url"].(map[string]any); ok { + if url, ok = urlMap["url"].(string); !ok { + return nil, fmt.Errorf("invalid message format") + } + } else { + if url, ok = data["image_url"].(string); !ok { + return nil, fmt.Errorf("invalid message format") + } + } + + types := []string{"jpeg", "jpg", "png"} + valid := false + for _, t := range types { + prefix := "data:image/" + t + ";base64," + if strings.HasPrefix(url, prefix) { + url = strings.TrimPrefix(url, prefix) + valid = true + break + } + } + + if !valid { + return nil, fmt.Errorf("invalid image input") + } + + img, err := base64.StdEncoding.DecodeString(url) + if err != nil { + return nil, fmt.Errorf("invalid message format") + } + + messages = append(messages, api.Message{Role: msg.Role, Images: []api.ImageData{img}}) + default: + return nil, fmt.Errorf("invalid message format") + } + } + default: + if msg.ToolCalls == nil { + return nil, fmt.Errorf("invalid message content type: %T", content) + } + + toolCalls := make([]api.ToolCall, len(msg.ToolCalls)) + for i, tc := range msg.ToolCalls { + toolCalls[i].Function.Name = tc.Function.Name + err := json.Unmarshal([]byte(tc.Function.Arguments), &toolCalls[i].Function.Arguments) + if err != nil { + return nil, fmt.Errorf("invalid tool call arguments") + } + } + messages = append(messages, api.Message{Role: msg.Role, ToolCalls: toolCalls}) + } } options := make(map[string]interface{}) @@ 
-323,13 +468,14 @@ func fromChatRequest(r ChatCompletionRequest) api.ChatRequest { format = "json" } - return api.ChatRequest{ + return &api.ChatRequest{ Model: r.Model, Messages: messages, Format: format, Options: options, Stream: &r.Stream, - } + Tools: r.Tools, + }, nil } func fromCompleteRequest(r CompletionRequest) (api.GenerateRequest, error) { @@ -379,6 +525,7 @@ func fromCompleteRequest(r CompletionRequest) (api.GenerateRequest, error) { Prompt: r.Prompt, Options: options, Stream: &r.Stream, + Suffix: r.Suffix, }, nil } @@ -407,6 +554,11 @@ type RetrieveWriter struct { model string } +type EmbedWriter struct { + BaseWriter + model string +} + func (w *BaseWriter) writeError(code int, data []byte) (int, error) { var serr api.StatusError err := json.Unmarshal(data, &serr) @@ -572,6 +724,33 @@ func (w *RetrieveWriter) Write(data []byte) (int, error) { return w.writeResponse(data) } +func (w *EmbedWriter) writeResponse(data []byte) (int, error) { + var embedResponse api.EmbedResponse + err := json.Unmarshal(data, &embedResponse) + + if err != nil { + return 0, err + } + + w.ResponseWriter.Header().Set("Content-Type", "application/json") + err = json.NewEncoder(w.ResponseWriter).Encode(toEmbeddingList(w.model, embedResponse)) + + if err != nil { + return 0, err + } + + return len(data), nil +} + +func (w *EmbedWriter) Write(data []byte) (int, error) { + code := w.ResponseWriter.Status() + if code != http.StatusOK { + return w.writeError(code, data) + } + + return w.writeResponse(data) +} + func ListMiddleware() gin.HandlerFunc { return func(c *gin.Context) { w := &ListWriter{ @@ -635,6 +814,47 @@ func CompletionsMiddleware() gin.HandlerFunc { id: fmt.Sprintf("cmpl-%d", rand.Intn(999)), } + c.Writer = w + c.Next() + } +} + +func EmbeddingsMiddleware() gin.HandlerFunc { + return func(c *gin.Context) { + var req EmbedRequest + err := c.ShouldBindJSON(&req) + if err != nil { + c.AbortWithStatusJSON(http.StatusBadRequest, NewError(http.StatusBadRequest, err.Error())) + return + } + + if req.Input == "" { + req.Input = []string{""} + } + + if req.Input == nil { + c.AbortWithStatusJSON(http.StatusBadRequest, NewError(http.StatusBadRequest, "invalid input")) + return + } + + if v, ok := req.Input.([]any); ok && len(v) == 0 { + c.AbortWithStatusJSON(http.StatusBadRequest, NewError(http.StatusBadRequest, "invalid input")) + return + } + + var b bytes.Buffer + if err := json.NewEncoder(&b).Encode(api.EmbedRequest{Model: req.Model, Input: req.Input}); err != nil { + c.AbortWithStatusJSON(http.StatusInternalServerError, NewError(http.StatusInternalServerError, err.Error())) + return + } + + c.Request.Body = io.NopCloser(&b) + + w := &EmbedWriter{ + BaseWriter: BaseWriter{ResponseWriter: c.Writer}, + model: req.Model, + } + c.Writer = w c.Next() @@ -656,7 +876,14 @@ func ChatMiddleware() gin.HandlerFunc { } var b bytes.Buffer - if err := json.NewEncoder(&b).Encode(fromChatRequest(req)); err != nil { + + chatReq, err := fromChatRequest(req) + if err != nil { + c.AbortWithStatusJSON(http.StatusBadRequest, NewError(http.StatusBadRequest, err.Error())) + return + } + + if err := json.NewEncoder(&b).Encode(chatReq); err != nil { c.AbortWithStatusJSON(http.StatusInternalServerError, NewError(http.StatusInternalServerError, err.Error())) return } diff --git a/openai/openai_test.go b/openai/openai_test.go index 5f1ae52e..f978d46c 100644 --- a/openai/openai_test.go +++ b/openai/openai_test.go @@ -2,6 +2,7 @@ package openai import ( "bytes" + "encoding/base64" "encoding/json" "io" "net/http" @@ -15,64 +16,199 
@@ import ( "github.com/stretchr/testify/assert" ) -func TestMiddlewareRequests(t *testing.T) { +const prefix = `data:image/jpeg;base64,` +const image = `iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNk+A8AAQUBAScY42YAAAAASUVORK5CYII=` +const imageURL = prefix + image + +func prepareRequest(req *http.Request, body any) { + bodyBytes, _ := json.Marshal(body) + req.Body = io.NopCloser(bytes.NewReader(bodyBytes)) + req.Header.Set("Content-Type", "application/json") +} + +func captureRequestMiddleware(capturedRequest any) gin.HandlerFunc { + return func(c *gin.Context) { + bodyBytes, _ := io.ReadAll(c.Request.Body) + c.Request.Body = io.NopCloser(bytes.NewReader(bodyBytes)) + err := json.Unmarshal(bodyBytes, capturedRequest) + if err != nil { + c.AbortWithStatusJSON(http.StatusInternalServerError, "failed to unmarshal request") + } + c.Next() + } +} + +func TestChatMiddleware(t *testing.T) { type testCase struct { Name string - Method string - Path string - Handler func() gin.HandlerFunc Setup func(t *testing.T, req *http.Request) - Expected func(t *testing.T, req *http.Request) + Expected func(t *testing.T, req *api.ChatRequest, resp *httptest.ResponseRecorder) } - var capturedRequest *http.Request - - captureRequestMiddleware := func() gin.HandlerFunc { - return func(c *gin.Context) { - bodyBytes, _ := io.ReadAll(c.Request.Body) - c.Request.Body = io.NopCloser(bytes.NewReader(bodyBytes)) - capturedRequest = c.Request - c.Next() - } - } + var capturedRequest *api.ChatRequest testCases := []testCase{ { - Name: "chat handler", - Method: http.MethodPost, - Path: "/api/chat", - Handler: ChatMiddleware, + Name: "chat handler", Setup: func(t *testing.T, req *http.Request) { body := ChatCompletionRequest{ Model: "test-model", Messages: []Message{{Role: "user", Content: "Hello"}}, } - - bodyBytes, _ := json.Marshal(body) - - req.Body = io.NopCloser(bytes.NewReader(bodyBytes)) - req.Header.Set("Content-Type", "application/json") + prepareRequest(req, body) }, - Expected: func(t *testing.T, req *http.Request) { - var chatReq api.ChatRequest - if err := json.NewDecoder(req.Body).Decode(&chatReq); err != nil { - t.Fatal(err) + Expected: func(t *testing.T, req *api.ChatRequest, resp *httptest.ResponseRecorder) { + if resp.Code != http.StatusOK { + t.Fatalf("expected 200, got %d", resp.Code) } - if chatReq.Messages[0].Role != "user" { - t.Fatalf("expected 'user', got %s", chatReq.Messages[0].Role) + if req.Messages[0].Role != "user" { + t.Fatalf("expected 'user', got %s", req.Messages[0].Role) } - if chatReq.Messages[0].Content != "Hello" { - t.Fatalf("expected 'Hello', got %s", chatReq.Messages[0].Content) + if req.Messages[0].Content != "Hello" { + t.Fatalf("expected 'Hello', got %s", req.Messages[0].Content) } }, }, { - Name: "completions handler", - Method: http.MethodPost, - Path: "/api/generate", - Handler: CompletionsMiddleware, + Name: "chat handler with image content", + Setup: func(t *testing.T, req *http.Request) { + body := ChatCompletionRequest{ + Model: "test-model", + Messages: []Message{ + { + Role: "user", Content: []map[string]any{ + {"type": "text", "text": "Hello"}, + {"type": "image_url", "image_url": map[string]string{"url": imageURL}}, + }, + }, + }, + } + prepareRequest(req, body) + }, + Expected: func(t *testing.T, req *api.ChatRequest, resp *httptest.ResponseRecorder) { + if resp.Code != http.StatusOK { + t.Fatalf("expected 200, got %d", resp.Code) + } + + if req.Messages[0].Role != "user" { + t.Fatalf("expected 'user', got %s", req.Messages[0].Role) + } + + if 
req.Messages[0].Content != "Hello" { + t.Fatalf("expected 'Hello', got %s", req.Messages[0].Content) + } + + img, _ := base64.StdEncoding.DecodeString(imageURL[len(prefix):]) + + if req.Messages[1].Role != "user" { + t.Fatalf("expected 'user', got %s", req.Messages[1].Role) + } + + if !bytes.Equal(req.Messages[1].Images[0], img) { + t.Fatalf("expected image encoding, got %s", req.Messages[1].Images[0]) + } + }, + }, + { + Name: "chat handler with tools", + Setup: func(t *testing.T, req *http.Request) { + body := ChatCompletionRequest{ + Model: "test-model", + Messages: []Message{ + {Role: "user", Content: "What's the weather like in Paris Today?"}, + {Role: "assistant", ToolCalls: []ToolCall{{ + ID: "id", + Type: "function", + Function: struct { + Name string `json:"name"` + Arguments string `json:"arguments"` + }{ + Name: "get_current_weather", + Arguments: "{\"location\": \"Paris, France\", \"format\": \"celsius\"}", + }, + }}}, + }, + } + prepareRequest(req, body) + }, + Expected: func(t *testing.T, req *api.ChatRequest, resp *httptest.ResponseRecorder) { + if resp.Code != 200 { + t.Fatalf("expected 200, got %d", resp.Code) + } + + if req.Messages[0].Content != "What's the weather like in Paris Today?" { + t.Fatalf("expected What's the weather like in Paris Today?, got %s", req.Messages[0].Content) + } + + if req.Messages[1].ToolCalls[0].Function.Arguments["location"] != "Paris, France" { + t.Fatalf("expected 'Paris, France', got %v", req.Messages[1].ToolCalls[0].Function.Arguments["location"]) + } + + if req.Messages[1].ToolCalls[0].Function.Arguments["format"] != "celsius" { + t.Fatalf("expected celsius, got %v", req.Messages[1].ToolCalls[0].Function.Arguments["format"]) + } + }, + }, + { + Name: "chat handler error forwarding", + Setup: func(t *testing.T, req *http.Request) { + body := ChatCompletionRequest{ + Model: "test-model", + Messages: []Message{{Role: "user", Content: 2}}, + } + prepareRequest(req, body) + }, + Expected: func(t *testing.T, req *api.ChatRequest, resp *httptest.ResponseRecorder) { + if resp.Code != http.StatusBadRequest { + t.Fatalf("expected 400, got %d", resp.Code) + } + + if !strings.Contains(resp.Body.String(), "invalid message content type") { + t.Fatalf("error was not forwarded") + } + }, + }, + } + + endpoint := func(c *gin.Context) { + c.Status(http.StatusOK) + } + + gin.SetMode(gin.TestMode) + router := gin.New() + router.Use(ChatMiddleware(), captureRequestMiddleware(&capturedRequest)) + router.Handle(http.MethodPost, "/api/chat", endpoint) + + for _, tc := range testCases { + t.Run(tc.Name, func(t *testing.T) { + req, _ := http.NewRequest(http.MethodPost, "/api/chat", nil) + + tc.Setup(t, req) + + resp := httptest.NewRecorder() + router.ServeHTTP(resp, req) + + tc.Expected(t, capturedRequest, resp) + + capturedRequest = nil + }) + } +} + +func TestCompletionsMiddleware(t *testing.T) { + type testCase struct { + Name string + Setup func(t *testing.T, req *http.Request) + Expected func(t *testing.T, req *api.GenerateRequest, resp *httptest.ResponseRecorder) + } + + var capturedRequest *api.GenerateRequest + + testCases := []testCase{ + { + Name: "completions handler", Setup: func(t *testing.T, req *http.Request) { temp := float32(0.8) body := CompletionRequest{ @@ -80,28 +216,20 @@ func TestMiddlewareRequests(t *testing.T) { Prompt: "Hello", Temperature: &temp, Stop: []string{"\n", "stop"}, + Suffix: "suffix", } - - bodyBytes, _ := json.Marshal(body) - - req.Body = io.NopCloser(bytes.NewReader(bodyBytes)) - req.Header.Set("Content-Type", 
"application/json") + prepareRequest(req, body) }, - Expected: func(t *testing.T, req *http.Request) { - var genReq api.GenerateRequest - if err := json.NewDecoder(req.Body).Decode(&genReq); err != nil { - t.Fatal(err) + Expected: func(t *testing.T, req *api.GenerateRequest, resp *httptest.ResponseRecorder) { + if req.Prompt != "Hello" { + t.Fatalf("expected 'Hello', got %s", req.Prompt) } - if genReq.Prompt != "Hello" { - t.Fatalf("expected 'Hello', got %s", genReq.Prompt) + if req.Options["temperature"] != 1.6 { + t.Fatalf("expected 1.6, got %f", req.Options["temperature"]) } - if genReq.Options["temperature"] != 1.6 { - t.Fatalf("expected 1.6, got %f", genReq.Options["temperature"]) - } - - stopTokens, ok := genReq.Options["stop"].([]any) + stopTokens, ok := req.Options["stop"].([]any) if !ok { t.Fatalf("expected stop tokens to be a list") @@ -110,33 +238,160 @@ func TestMiddlewareRequests(t *testing.T) { if stopTokens[0] != "\n" || stopTokens[1] != "stop" { t.Fatalf("expected ['\\n', 'stop'], got %v", stopTokens) } + + if req.Suffix != "suffix" { + t.Fatalf("expected 'suffix', got %s", req.Suffix) + } + }, + }, + { + Name: "completions handler error forwarding", + Setup: func(t *testing.T, req *http.Request) { + body := CompletionRequest{ + Model: "test-model", + Prompt: "Hello", + Temperature: nil, + Stop: []int{1, 2}, + Suffix: "suffix", + } + prepareRequest(req, body) + }, + Expected: func(t *testing.T, req *api.GenerateRequest, resp *httptest.ResponseRecorder) { + if resp.Code != http.StatusBadRequest { + t.Fatalf("expected 400, got %d", resp.Code) + } + + if !strings.Contains(resp.Body.String(), "invalid type for 'stop' field") { + t.Fatalf("error was not forwarded") + } }, }, } - gin.SetMode(gin.TestMode) - router := gin.New() - endpoint := func(c *gin.Context) { c.Status(http.StatusOK) } + gin.SetMode(gin.TestMode) + router := gin.New() + router.Use(CompletionsMiddleware(), captureRequestMiddleware(&capturedRequest)) + router.Handle(http.MethodPost, "/api/generate", endpoint) + for _, tc := range testCases { t.Run(tc.Name, func(t *testing.T) { - router = gin.New() - router.Use(captureRequestMiddleware()) - router.Use(tc.Handler()) - router.Handle(tc.Method, tc.Path, endpoint) - req, _ := http.NewRequest(tc.Method, tc.Path, nil) + req, _ := http.NewRequest(http.MethodPost, "/api/generate", nil) - if tc.Setup != nil { - tc.Setup(t, req) - } + tc.Setup(t, req) resp := httptest.NewRecorder() router.ServeHTTP(resp, req) - tc.Expected(t, capturedRequest) + tc.Expected(t, capturedRequest, resp) + + capturedRequest = nil + }) + } +} + +func TestEmbeddingsMiddleware(t *testing.T) { + type testCase struct { + Name string + Setup func(t *testing.T, req *http.Request) + Expected func(t *testing.T, req *api.EmbedRequest, resp *httptest.ResponseRecorder) + } + + var capturedRequest *api.EmbedRequest + + testCases := []testCase{ + { + Name: "embed handler single input", + Setup: func(t *testing.T, req *http.Request) { + body := EmbedRequest{ + Input: "Hello", + Model: "test-model", + } + prepareRequest(req, body) + }, + Expected: func(t *testing.T, req *api.EmbedRequest, resp *httptest.ResponseRecorder) { + if req.Input != "Hello" { + t.Fatalf("expected 'Hello', got %s", req.Input) + } + + if req.Model != "test-model" { + t.Fatalf("expected 'test-model', got %s", req.Model) + } + }, + }, + { + Name: "embed handler batch input", + Setup: func(t *testing.T, req *http.Request) { + body := EmbedRequest{ + Input: []string{"Hello", "World"}, + Model: "test-model", + } + prepareRequest(req, body) + 
}, + Expected: func(t *testing.T, req *api.EmbedRequest, resp *httptest.ResponseRecorder) { + input, ok := req.Input.([]any) + + if !ok { + t.Fatalf("expected input to be a list") + } + + if input[0].(string) != "Hello" { + t.Fatalf("expected 'Hello', got %s", input[0]) + } + + if input[1].(string) != "World" { + t.Fatalf("expected 'World', got %s", input[1]) + } + + if req.Model != "test-model" { + t.Fatalf("expected 'test-model', got %s", req.Model) + } + }, + }, + { + Name: "embed handler error forwarding", + Setup: func(t *testing.T, req *http.Request) { + body := EmbedRequest{ + Model: "test-model", + } + prepareRequest(req, body) + }, + Expected: func(t *testing.T, req *api.EmbedRequest, resp *httptest.ResponseRecorder) { + if resp.Code != http.StatusBadRequest { + t.Fatalf("expected 400, got %d", resp.Code) + } + + if !strings.Contains(resp.Body.String(), "invalid input") { + t.Fatalf("error was not forwarded") + } + }, + }, + } + + endpoint := func(c *gin.Context) { + c.Status(http.StatusOK) + } + + gin.SetMode(gin.TestMode) + router := gin.New() + router.Use(EmbeddingsMiddleware(), captureRequestMiddleware(&capturedRequest)) + router.Handle(http.MethodPost, "/api/embed", endpoint) + + for _, tc := range testCases { + t.Run(tc.Name, func(t *testing.T) { + req, _ := http.NewRequest(http.MethodPost, "/api/embed", nil) + + tc.Setup(t, req) + + resp := httptest.NewRecorder() + router.ServeHTTP(resp, req) + + tc.Expected(t, capturedRequest, resp) + + capturedRequest = nil }) } } @@ -154,36 +409,6 @@ func TestMiddlewareResponses(t *testing.T) { } testCases := []testCase{ - { - Name: "completions handler error forwarding", - Method: http.MethodPost, - Path: "/api/generate", - TestPath: "/api/generate", - Handler: CompletionsMiddleware, - Endpoint: func(c *gin.Context) { - c.JSON(http.StatusBadRequest, gin.H{"error": "invalid request"}) - }, - Setup: func(t *testing.T, req *http.Request) { - body := CompletionRequest{ - Model: "test-model", - Prompt: "Hello", - } - - bodyBytes, _ := json.Marshal(body) - - req.Body = io.NopCloser(bytes.NewReader(bodyBytes)) - req.Header.Set("Content-Type", "application/json") - }, - Expected: func(t *testing.T, resp *httptest.ResponseRecorder) { - if resp.Code != http.StatusBadRequest { - t.Fatalf("expected 400, got %d", resp.Code) - } - - if !strings.Contains(resp.Body.String(), `"invalid request"`) { - t.Fatalf("error was not forwarded") - } - }, - }, { Name: "list handler", Method: http.MethodGet, @@ -200,8 +425,6 @@ func TestMiddlewareResponses(t *testing.T) { }) }, Expected: func(t *testing.T, resp *httptest.ResponseRecorder) { - assert.Equal(t, http.StatusOK, resp.Code) - var listResp ListCompletion if err := json.NewDecoder(resp.Body).Decode(&listResp); err != nil { t.Fatal(err) @@ -265,6 +488,8 @@ func TestMiddlewareResponses(t *testing.T) { resp := httptest.NewRecorder() router.ServeHTTP(resp, req) + assert.Equal(t, http.StatusOK, resp.Code) + tc.Expected(t, resp) }) } diff --git a/parser/parser_test.go b/parser/parser_test.go index 2b5c4c88..48044bc0 100644 --- a/parser/parser_test.go +++ b/parser/parser_test.go @@ -451,6 +451,7 @@ func TestParseFileParameters(t *testing.T) { "num_predict 1": {"num_predict", "1"}, "top_k 1": {"top_k", "1"}, "top_p 1.0": {"top_p", "1.0"}, + "min_p 0.05": {"min_p", "0.05"}, "tfs_z 1.0": {"tfs_z", "1.0"}, "typical_p 1.0": {"typical_p", "1.0"}, "repeat_last_n 1": {"repeat_last_n", "1"}, diff --git a/scripts/install.sh b/scripts/install.sh index 226535ee..c356d0d0 100644 --- a/scripts/install.sh +++ 
b/scripts/install.sh @@ -211,19 +211,29 @@ if check_gpu lspci amdgpu || check_gpu lshw amdgpu; then exit 0 fi +CUDA_REPO_ERR_MSG="NVIDIA GPU detected, but your OS and Architecture are not supported by NVIDIA. Please install the CUDA driver manually https://docs.nvidia.com/cuda/cuda-installation-guide-linux/" # ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#rhel-7-centos-7 # ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#rhel-8-rocky-8 # ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#rhel-9-rocky-9 # ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#fedora install_cuda_driver_yum() { status 'Installing NVIDIA repository...' + case $PACKAGE_MANAGER in yum) $SUDO $PACKAGE_MANAGER -y install yum-utils - $SUDO $PACKAGE_MANAGER-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-$1$2.repo + if curl -I --silent --fail --location "https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-$1$2.repo" >/dev/null ; then + $SUDO $PACKAGE_MANAGER-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-$1$2.repo + else + error $CUDA_REPO_ERR_MSG + fi ;; dnf) - $SUDO $PACKAGE_MANAGER config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-$1$2.repo + if curl -I --silent --fail --location "https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-$1$2.repo" >/dev/null ; then + $SUDO $PACKAGE_MANAGER config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-$1$2.repo + else + error $CUDA_REPO_ERR_MSG + fi ;; esac @@ -248,7 +258,11 @@ install_cuda_driver_yum() { # ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#debian install_cuda_driver_apt() { status 'Installing NVIDIA repository...' 
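+ # probe the NVIDIA repo for a keyring package for this distro and architecture before downloading, so unsupported platforms fail with a clear error instead of a broken download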
- curl -fsSL -o $TEMP_DIR/cuda-keyring.deb https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-keyring_1.1-1_all.deb + if curl -I --silent --fail --location "https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-keyring_1.1-1_all.deb" >/dev/null ; then + curl -fsSL -o $TEMP_DIR/cuda-keyring.deb https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-keyring_1.1-1_all.deb + else + error $CUDA_REPO_ERR_MSG + fi case $1 in debian) diff --git a/server/auth.go b/server/auth.go index e92a5b65..dcef5bf9 100644 --- a/server/auth.go +++ b/server/auth.go @@ -67,7 +67,7 @@ func getAuthorizationToken(ctx context.Context, challenge registryChallenge) (st headers.Add("Authorization", signature) - response, err := makeRequest(ctx, http.MethodGet, redirectURL, headers, nil, nil) + response, err := makeRequest(ctx, http.MethodGet, redirectURL, headers, nil, ®istryOptions{}) if err != nil { return "", err } diff --git a/server/download.go b/server/download.go index d93cd3b4..45483ba6 100644 --- a/server/download.go +++ b/server/download.go @@ -8,6 +8,7 @@ import ( "io" "log/slog" "math" + "math/rand/v2" "net/http" "net/url" "os" @@ -43,17 +44,19 @@ type blobDownload struct { context.CancelFunc - done bool + done chan struct{} err error references atomic.Int32 } type blobDownloadPart struct { - N int - Offset int64 - Size int64 - Completed int64 - lastUpdated time.Time + N int + Offset int64 + Size int64 + Completed atomic.Int64 + + lastUpdatedMu sync.Mutex + lastUpdated time.Time *blobDownload `json:"-"` } @@ -71,7 +74,7 @@ func (p *blobDownloadPart) Name() string { } func (p *blobDownloadPart) StartsAt() int64 { - return p.Offset + p.Completed + return p.Offset + p.Completed.Load() } func (p *blobDownloadPart) StopsAt() int64 { @@ -81,7 +84,9 @@ func (p *blobDownloadPart) StopsAt() int64 { func (p *blobDownloadPart) Write(b []byte) (n int, err error) { n = len(b) p.blobDownload.Completed.Add(int64(n)) + p.lastUpdatedMu.Lock() p.lastUpdated = time.Now() + p.lastUpdatedMu.Unlock() return n, nil } @@ -91,6 +96,8 @@ func (b *blobDownload) Prepare(ctx context.Context, requestURL *url.URL, opts *r return err } + b.done = make(chan struct{}) + for _, partFilePath := range partFilePaths { part, err := b.readPart(partFilePath) if err != nil { @@ -98,7 +105,7 @@ func (b *blobDownload) Prepare(ctx context.Context, requestURL *url.URL, opts *r } b.Total += part.Size - b.Completed.Add(part.Completed) + b.Completed.Add(part.Completed.Load()) b.Parts = append(b.Parts, part) } @@ -138,9 +145,36 @@ func (b *blobDownload) Prepare(ctx context.Context, requestURL *url.URL, opts *r } func (b *blobDownload) Run(ctx context.Context, requestURL *url.URL, opts *registryOptions) { + defer close(b.done) b.err = b.run(ctx, requestURL, opts) } +func newBackoff(maxBackoff time.Duration) func(ctx context.Context) error { + var n int + return func(ctx context.Context) error { + if ctx.Err() != nil { + return ctx.Err() + } + + n++ + + // n^2 backoff timer is a little smoother than the + // common choice of 2^n. + d := min(time.Duration(n*n)*10*time.Millisecond, maxBackoff) + // Randomize the delay between 0.5-1.5 x msec, in order + // to prevent accidental "thundering herd" problems. 
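+ // e.g. with a 10s maxBackoff, the third retry (n=3) starts from 3*3*10ms = 90ms and is jittered into the range [45ms, 135ms)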
+ d = time.Duration(float64(d) * (rand.Float64() + 0.5)) + t := time.NewTimer(d) + defer t.Stop() + select { + case <-ctx.Done(): + return ctx.Err() + case <-t.C: + return nil + } + } +} + func (b *blobDownload) run(ctx context.Context, requestURL *url.URL, opts *registryOptions) error { defer blobDownloadManager.Delete(b.Digest) ctx, b.CancelFunc = context.WithCancel(ctx) @@ -153,11 +187,57 @@ func (b *blobDownload) run(ctx context.Context, requestURL *url.URL, opts *regis _ = file.Truncate(b.Total) + directURL, err := func() (*url.URL, error) { + ctx, cancel := context.WithTimeout(ctx, 30*time.Second) + defer cancel() + + backoff := newBackoff(10 * time.Second) + for { + // shallow clone opts to be used in the closure + // without affecting the outer opts. + newOpts := new(registryOptions) + *newOpts = *opts + + newOpts.CheckRedirect = func(req *http.Request, via []*http.Request) error { + if len(via) > 10 { + return errors.New("maxium redirects exceeded (10) for directURL") + } + + // if the hostname is the same, allow the redirect + if req.URL.Hostname() == requestURL.Hostname() { + return nil + } + + // stop at the first redirect that is not + // the same hostname as the original + // request. + return http.ErrUseLastResponse + } + + resp, err := makeRequestWithRetry(ctx, http.MethodGet, requestURL, nil, nil, newOpts) + if err != nil { + slog.Warn("failed to get direct URL; backing off and retrying", "err", err) + if err := backoff(ctx); err != nil { + return nil, err + } + continue + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusTemporaryRedirect { + return nil, fmt.Errorf("unexpected status code %d", resp.StatusCode) + } + return resp.Location() + } + }() + if err != nil { + return err + } + g, inner := errgroup.WithContext(ctx) g.SetLimit(numDownloadParts) for i := range b.Parts { part := b.Parts[i] - if part.Completed == part.Size { + if part.Completed.Load() == part.Size { continue } @@ -165,7 +245,7 @@ func (b *blobDownload) run(ctx context.Context, requestURL *url.URL, opts *regis var err error for try := 0; try < maxRetries; try++ { w := io.NewOffsetWriter(file, part.StartsAt()) - err = b.downloadChunk(inner, requestURL, w, part, opts) + err = b.downloadChunk(inner, directURL, w, part) switch { case errors.Is(err, context.Canceled), errors.Is(err, syscall.ENOSPC): // return immediately if the context is canceled or the device is out of space @@ -206,29 +286,31 @@ func (b *blobDownload) run(ctx context.Context, requestURL *url.URL, opts *regis return err } - b.done = true return nil } -func (b *blobDownload) downloadChunk(ctx context.Context, requestURL *url.URL, w io.Writer, part *blobDownloadPart, opts *registryOptions) error { +func (b *blobDownload) downloadChunk(ctx context.Context, requestURL *url.URL, w io.Writer, part *blobDownloadPart) error { g, ctx := errgroup.WithContext(ctx) g.Go(func() error { - headers := make(http.Header) - headers.Set("Range", fmt.Sprintf("bytes=%d-%d", part.StartsAt(), part.StopsAt()-1)) - resp, err := makeRequestWithRetry(ctx, http.MethodGet, requestURL, headers, nil, opts) + req, err := http.NewRequestWithContext(ctx, http.MethodGet, requestURL.String(), nil) + if err != nil { + return err + } + req.Header.Set("Range", fmt.Sprintf("bytes=%d-%d", part.StartsAt(), part.StopsAt()-1)) + resp, err := http.DefaultClient.Do(req) if err != nil { return err } defer resp.Body.Close() - n, err := io.CopyN(w, io.TeeReader(resp.Body, part), part.Size-part.Completed) + n, err := io.CopyN(w, io.TeeReader(resp.Body, part), 
part.Size-part.Completed.Load()) if err != nil && !errors.Is(err, context.Canceled) && !errors.Is(err, io.ErrUnexpectedEOF) { // rollback progress b.Completed.Add(-n) return err } - part.Completed += n + part.Completed.Add(n) if err := b.writePart(part.Name(), part); err != nil { return err } @@ -242,15 +324,21 @@ func (b *blobDownload) downloadChunk(ctx context.Context, requestURL *url.URL, w for { select { case <-ticker.C: - if part.Completed >= part.Size { + if part.Completed.Load() >= part.Size { return nil } - if !part.lastUpdated.IsZero() && time.Since(part.lastUpdated) > 5*time.Second { + part.lastUpdatedMu.Lock() + lastUpdated := part.lastUpdated + part.lastUpdatedMu.Unlock() + + if !lastUpdated.IsZero() && time.Since(lastUpdated) > 5*time.Second { const msg = "%s part %d stalled; retrying. If this persists, press ctrl-c to exit, then 'ollama pull' to find a faster connection." slog.Info(fmt.Sprintf(msg, b.Digest[7:19], part.N)) // reset last updated + part.lastUpdatedMu.Lock() part.lastUpdated = time.Time{} + part.lastUpdatedMu.Unlock() return errPartStalled } case <-ctx.Done(): @@ -315,6 +403,8 @@ func (b *blobDownload) Wait(ctx context.Context, fn func(api.ProgressResponse)) ticker := time.NewTicker(60 * time.Millisecond) for { select { + case <-b.done: + return b.err case <-ticker.C: fn(api.ProgressResponse{ Status: fmt.Sprintf("pulling %s", b.Digest[7:19]), @@ -322,10 +412,6 @@ func (b *blobDownload) Wait(ctx context.Context, fn func(api.ProgressResponse)) Total: b.Total, Completed: b.Completed.Load(), }) - - if b.done || b.err != nil { - return b.err - } case <-ctx.Done(): return ctx.Err() } diff --git a/server/images.go b/server/images.go index 688d5dca..836dbcc2 100644 --- a/server/images.go +++ b/server/images.go @@ -34,17 +34,28 @@ import ( "github.com/ollama/ollama/version" ) -var errCapabilityCompletion = errors.New("completion") +var ( + errCapabilities = errors.New("does not support") + errCapabilityCompletion = errors.New("completion") + errCapabilityTools = errors.New("tools") + errCapabilityInsert = errors.New("insert") +) type Capability string -const CapabilityCompletion = Capability("completion") +const ( + CapabilityCompletion = Capability("completion") + CapabilityTools = Capability("tools") + CapabilityInsert = Capability("insert") +) type registryOptions struct { Insecure bool Username string Password string Token string + + CheckRedirect func(req *http.Request, via []*http.Request) error } type Model struct { @@ -88,6 +99,15 @@ func (m *Model) CheckCapabilities(caps ...Capability) error { if _, ok := ggml.KV()[fmt.Sprintf("%s.pooling_type", ggml.KV().Architecture())]; ok { errs = append(errs, errCapabilityCompletion) } + case CapabilityTools: + if !slices.Contains(m.Template.Vars(), "tools") { + errs = append(errs, errCapabilityTools) + } + case CapabilityInsert: + vars := m.Template.Vars() + if !slices.Contains(vars, "suffix") { + errs = append(errs, errCapabilityInsert) + } default: slog.Error("unknown capability", "capability", cap) return fmt.Errorf("unknown capability: %s", cap) @@ -95,7 +115,7 @@ func (m *Model) CheckCapabilities(caps ...Capability) error { } if err := errors.Join(errs...); err != nil { - return fmt.Errorf("missing capabilities: %w", errors.Join(errs...)) + return fmt.Errorf("%w %w", errCapabilities, errors.Join(errs...)) } return nil @@ -474,6 +494,12 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio layers = append(layers, baseLayer.Layer) } case "license", "template", "system": + if c.Name == 
"template" { + if _, err := template.Parse(c.Args); err != nil { + return fmt.Errorf("%w: %s", errBadTemplate, err) + } + } + if c.Name != "license" { // replace layers = slices.DeleteFunc(layers, func(layer *Layer) bool { @@ -1107,7 +1133,9 @@ func makeRequest(ctx context.Context, method string, requestURL *url.URL, header req.ContentLength = contentLength } - resp, err := http.DefaultClient.Do(req) + resp, err := (&http.Client{ + CheckRedirect: regOpts.CheckRedirect, + }).Do(req) if err != nil { return nil, err } diff --git a/server/model.go b/server/model.go index a79f549a..c6d3078f 100644 --- a/server/model.go +++ b/server/model.go @@ -4,6 +4,7 @@ import ( "archive/zip" "bytes" "context" + "encoding/json" "errors" "fmt" "io" @@ -11,6 +12,9 @@ import ( "net/http" "os" "path/filepath" + "slices" + "strings" + "text/template/parse" "github.com/ollama/ollama/api" "github.com/ollama/ollama/convert" @@ -259,13 +263,27 @@ func detectChatTemplate(layers []*layerGGML) ([]*layerGGML, error) { if t, err := template.Named(s); err != nil { slog.Debug("template detection", "error", err) } else { - tmpl, err := NewLayer(t.Reader(), "application/vnd.ollama.image.template") + layer, err := NewLayer(t.Reader(), "application/vnd.ollama.image.template") if err != nil { return nil, err } - tmpl.status = fmt.Sprintf("using autodetected template %s", t.Name) - layers = append(layers, &layerGGML{tmpl, nil}) + layer.status = fmt.Sprintf("using autodetected template %s", t.Name) + layers = append(layers, &layerGGML{layer, nil}) + + if t.Parameters != nil { + var b bytes.Buffer + if err := json.NewEncoder(&b).Encode(t.Parameters); err != nil { + return nil, err + } + + layer, err := NewLayer(&b, "application/vnd.ollama.image.params") + if err != nil { + return nil, err + } + + layers = append(layers, &layerGGML{layer, nil}) + } } } } @@ -289,3 +307,113 @@ func detectContentType(r io.Reader) (string, error) { return "unknown", nil } + +// parseToolCalls attempts to parse a JSON string into a slice of ToolCalls. 
+// mxyng: this only really works if the input contains tool calls in some JSON format +func (m *Model) parseToolCalls(s string) ([]api.ToolCall, bool) { + // create a subtree from the node that ranges over .ToolCalls + tmpl := m.Template.Subtree(func(n parse.Node) bool { + if t, ok := n.(*parse.RangeNode); ok { + return slices.Contains(template.Identifiers(t.Pipe), "ToolCalls") + } + + return false + }) + + if tmpl == nil { + return nil, false + } + + var b bytes.Buffer + if err := tmpl.Execute(&b, map[string][]api.ToolCall{ + "ToolCalls": { + { + Function: api.ToolCallFunction{ + Name: "@@name@@", + Arguments: api.ToolCallFunctionArguments{ + "@@argument@@": 1, + }, + }, + }, + }, + }); err != nil { + return nil, false + } + + var kv map[string]any + // execute the subtree with placeholders to identify the keys + // trim any commands that might exist in the template + if err := json.Unmarshal(bytes.TrimSuffix(b.Bytes(), []byte(",")), &kv); err != nil { + return nil, false + } + + // find the keys that correspond to the name and arguments fields + var name, arguments string + for k, v := range kv { + switch v.(type) { + case string: + name = k + case map[string]any: + arguments = k + } + } + + if name == "" || arguments == "" { + return nil, false + } + + var objs []map[string]any + for offset := 0; offset < len(s); { + var obj map[string]any + decoder := json.NewDecoder(strings.NewReader(s[offset:])) + if err := decoder.Decode(&obj); errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF) { + break + } else if syntax := &(json.SyntaxError{}); errors.As(err, &syntax) { + // skip over any syntax errors + offset += int(syntax.Offset) + } else if unmarshalType := &(json.UnmarshalTypeError{}); errors.As(err, &unmarshalType) { + // skip over any unmarshalable types + offset += int(unmarshalType.Offset) + } else if err != nil { + slog.Error("parseToolCalls", "error", err) + return nil, false + } else { + offset += int(decoder.InputOffset()) + + // collect all nested objects + var collect func(any) []map[string]any + collect = func(obj any) (all []map[string]any) { + switch o := obj.(type) { + case map[string]any: + all = append(all, o) + for _, v := range o { + all = append(all, collect(v)...) + } + case []any: + for _, v := range o { + all = append(all, collect(v)...) + } + } + + return all + } + objs = append(objs, collect(obj)...) 
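+ // objs now holds every JSON object decoded from s, including nested ones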
+ } + } + + var toolCalls []api.ToolCall + for _, kv := range objs { + n, nok := kv[name].(string) + a, aok := kv[arguments].(map[string]any) + if nok && aok { + toolCalls = append(toolCalls, api.ToolCall{ + Function: api.ToolCallFunction{ + Name: n, + Arguments: a, + }, + }) + } + } + + return toolCalls, len(toolCalls) > 0 +} diff --git a/server/model_test.go b/server/model_test.go index a383b7e7..5829adfc 100644 --- a/server/model_test.go +++ b/server/model_test.go @@ -3,7 +3,9 @@ package server import ( "archive/zip" "bytes" + "encoding/json" "errors" + "fmt" "io" "os" "path/filepath" @@ -11,7 +13,9 @@ import ( "strings" "testing" + "github.com/google/go-cmp/cmp" "github.com/ollama/ollama/api" + "github.com/ollama/ollama/template" ) func createZipFile(t *testing.T, name string) *os.File { @@ -110,3 +114,123 @@ func TestExtractFromZipFile(t *testing.T) { }) } } + +func readFile(t *testing.T, base, name string) *bytes.Buffer { + t.Helper() + + bts, err := os.ReadFile(filepath.Join(base, name)) + if err != nil { + t.Fatal(err) + } + + return bytes.NewBuffer(bts) +} + +func TestExecuteWithTools(t *testing.T) { + p := filepath.Join("testdata", "tools") + cases := []struct { + model string + output string + ok bool + }{ + {"mistral", `[TOOL_CALLS] [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]`, true}, + {"mistral", `[TOOL_CALLS] [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}] + +The temperature in San Francisco, CA is 70°F and in Toronto, Canada is 20°C.`, true}, + {"mistral", `I'm not aware of that information. 
However, I can suggest searching for the weather using the "get_current_weather" function: + + [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]`, true}, + {"mistral", " The weather in San Francisco, CA is 70°F and in Toronto, Canada is 20°C.", false}, + {"command-r-plus", "Action: ```json" + ` +[ + { + "tool_name": "get_current_weather", + "parameters": { + "format": "fahrenheit", + "location": "San Francisco, CA" + } + }, + { + "tool_name": "get_current_weather", + "parameters": { + "format": "celsius", + "location": "Toronto, Canada" + } + } +] +` + "```", true}, + {"command-r-plus", " The weather in San Francisco, CA is 70°F and in Toronto, Canada is 20°C.", false}, + {"firefunction", ` functools[{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]`, true}, + {"firefunction", " The weather in San Francisco, CA is 70°F and in Toronto, Canada is 20°C.", false}, + {"llama3-groq-tool-use", ` +{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}} +{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}} +`, true}, + {"xlam", `{"tool_calls": [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]}`, true}, + } + + var tools []api.Tool + if err := json.Unmarshal(readFile(t, p, "tools.json").Bytes(), &tools); err != nil { + t.Fatal(err) + } + + var messages []api.Message + if err := json.Unmarshal(readFile(t, p, "messages.json").Bytes(), &messages); err != nil { + t.Fatal(err) + } + + calls := []api.ToolCall{ + { + Function: api.ToolCallFunction{ + Name: "get_current_weather", + Arguments: api.ToolCallFunctionArguments{ + "format": "fahrenheit", + "location": "San Francisco, CA", + }, + }, + }, + { + Function: api.ToolCallFunction{ + Name: "get_current_weather", + Arguments: api.ToolCallFunctionArguments{ + "format": "celsius", + "location": "Toronto, Canada", + }, + }, + }, + } + + for _, tt := range cases { + t.Run(tt.model, func(t *testing.T) { + tmpl, err := template.Parse(readFile(t, p, fmt.Sprintf("%s.gotmpl", tt.model)).String()) + if err != nil { + t.Fatal(err) + } + + t.Run("template", func(t *testing.T) { + var actual bytes.Buffer + if err := tmpl.Execute(&actual, template.Values{Tools: tools, Messages: messages}); err != nil { + t.Fatal(err) + } + + if diff := cmp.Diff(actual.String(), readFile(t, p, fmt.Sprintf("%s.out", tt.model)).String()); diff != "" { + t.Errorf("mismatch (-got +want):\n%s", diff) + } + }) + + t.Run("parse", func(t *testing.T) { + m := &Model{Template: tmpl} + actual, ok := m.parseToolCalls(tt.output) + if ok != tt.ok { + t.Fatalf("expected %t, got %t", tt.ok, ok) + } + + if tt.ok { + if diff := cmp.Diff(actual, calls); diff != "" { + t.Errorf("mismatch (-got +want):\n%s", diff) + } + } + }) + }) + } +} diff --git a/server/prompt.go b/server/prompt.go index abc5e61e..be0d4969 100644 --- a/server/prompt.go +++ b/server/prompt.go @@ -15,7 +15,7 @@ type tokenizeFunc func(context.Context, string) ([]int, error) // chatPrompt accepts a list of messages and returns the prompt and images that should be used for the next chat turn. 
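// Any tools provided are passed through to the template so tool definitions can be rendered into the prompt.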
// chatPrompt truncates any messages that exceed the context window of the model, making sure to always include 1) the // latest message and 2) system messages -func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.Options, msgs []api.Message) (prompt string, images []llm.ImageData, _ error) { +func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.Options, msgs []api.Message, tools []api.Tool) (prompt string, images []llm.ImageData, _ error) { var system []api.Message // always include the last message n := len(msgs) - 1 @@ -29,7 +29,7 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api. } var b bytes.Buffer - if err := m.Template.Execute(&b, template.Values{Messages: append(system, msgs[i:]...)}); err != nil { + if err := m.Template.Execute(&b, template.Values{Messages: append(system, msgs[i:]...), Tools: tools}); err != nil { return "", nil, err } @@ -57,7 +57,7 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api. // truncate any messages that do not fit into the context window var b bytes.Buffer - if err := m.Template.Execute(&b, template.Values{Messages: append(system, msgs[n:]...)}); err != nil { + if err := m.Template.Execute(&b, template.Values{Messages: append(system, msgs[n:]...), Tools: tools}); err != nil { return "", nil, err } diff --git a/server/prompt_test.go b/server/prompt_test.go index d8caf3ed..02d23785 100644 --- a/server/prompt_test.go +++ b/server/prompt_test.go @@ -3,7 +3,6 @@ package server import ( "bytes" "context" - "strings" "testing" "github.com/google/go-cmp/cmp" @@ -11,14 +10,6 @@ import ( "github.com/ollama/ollama/template" ) -func tokenize(_ context.Context, s string) (tokens []int, err error) { - for range strings.Fields(s) { - tokens = append(tokens, len(tokens)) - } - - return -} - func TestChatPrompt(t *testing.T) { type expect struct { prompt string @@ -192,15 +183,11 @@ func TestChatPrompt(t *testing.T) { t.Run(tt.name, func(t *testing.T) { model := Model{Template: tmpl, ProjectorPaths: []string{"vision"}} opts := api.Options{Runner: api.Runner{NumCtx: tt.limit}} - prompt, images, err := chatPrompt(context.TODO(), &model, tokenize, &opts, tt.msgs) + prompt, images, err := chatPrompt(context.TODO(), &model, mockRunner{}.Tokenize, &opts, tt.msgs, nil) if err != nil { t.Fatal(err) } - if tt.prompt != prompt { - t.Errorf("expected %q, got %q", tt.prompt, prompt) - } - if diff := cmp.Diff(prompt, tt.prompt); diff != "" { t.Errorf("mismatch (-got +want):\n%s", diff) } diff --git a/server/routes.go b/server/routes.go index 4059c7c5..a560f369 100644 --- a/server/routes.go +++ b/server/routes.go @@ -9,6 +9,7 @@ import ( "fmt" "io" "log/slog" + "math" "net" "net/http" "net/netip" @@ -55,6 +56,7 @@ func init() { } var errRequired = errors.New("is required") +var errBadTemplate = errors.New("template error") func modelOptions(model *Model, requestOpts map[string]interface{}) (api.Options, error) { opts := api.DefaultOptions() @@ -102,6 +104,7 @@ func (s *Server) scheduleRunner(ctx context.Context, name string, caps []Capabil } func (s *Server) GenerateHandler(c *gin.Context) { + checkpointStart := time.Now() var req api.GenerateRequest if err := c.ShouldBindJSON(&req); errors.Is(err, io.EOF) { c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "missing request body"}) @@ -120,6 +123,10 @@ func (s *Server) GenerateHandler(c *gin.Context) { } caps := []Capability{CapabilityCompletion} + if req.Suffix != "" { + caps = append(caps, 
CapabilityInsert) + } + r, m, opts, err := s.scheduleRunner(c.Request.Context(), req.Model, caps, req.Options, req.KeepAlive) if errors.Is(err, errCapabilityCompletion) { c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("%q does not support generate", req.Model)}) @@ -129,6 +136,8 @@ func (s *Server) GenerateHandler(c *gin.Context) { return } + checkpointLoaded := time.Now() + if req.Prompt == "" { c.JSON(http.StatusOK, api.GenerateResponse{ Model: req.Model, @@ -146,19 +155,6 @@ func (s *Server) GenerateHandler(c *gin.Context) { prompt := req.Prompt if !req.Raw { - var msgs []api.Message - if req.System != "" { - msgs = append(msgs, api.Message{Role: "system", Content: req.System}) - } else if m.System != "" { - msgs = append(msgs, api.Message{Role: "system", Content: m.System}) - } - - for _, i := range images { - msgs = append(msgs, api.Message{Role: "user", Content: fmt.Sprintf("[img-%d]", i.ID)}) - } - - msgs = append(msgs, api.Message{Role: "user", Content: req.Prompt}) - tmpl := m.Template if req.Template != "" { tmpl, err = template.Parse(req.Template) @@ -179,7 +175,26 @@ func (s *Server) GenerateHandler(c *gin.Context) { b.WriteString(s) } - if err := tmpl.Execute(&b, template.Values{Messages: msgs}); err != nil { + var values template.Values + if req.Suffix != "" { + values.Prompt = prompt + values.Suffix = req.Suffix + } else { + var msgs []api.Message + if req.System != "" { + msgs = append(msgs, api.Message{Role: "system", Content: req.System}) + } else if m.System != "" { + msgs = append(msgs, api.Message{Role: "system", Content: m.System}) + } + + for _, i := range images { + msgs = append(msgs, api.Message{Role: "user", Content: fmt.Sprintf("[img-%d]", i.ID)}) + } + + values.Messages = append(msgs, api.Message{Role: "user", Content: req.Prompt}) + } + + if err := tmpl.Execute(&b, values); err != nil { c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) return } @@ -191,26 +206,48 @@ func (s *Server) GenerateHandler(c *gin.Context) { ch := make(chan any) go func() { + // TODO (jmorganca): avoid building the response twice both here and below + var sb strings.Builder defer close(ch) if err := r.Completion(c.Request.Context(), llm.CompletionRequest{ Prompt: prompt, Images: images, Format: req.Format, Options: opts, - }, func(r llm.CompletionResponse) { - ch <- api.GenerateResponse{ + }, func(cr llm.CompletionResponse) { + res := api.GenerateResponse{ Model: req.Model, CreatedAt: time.Now().UTC(), - Response: r.Content, - Done: r.Done, - DoneReason: r.DoneReason, + Response: cr.Content, + Done: cr.Done, + DoneReason: cr.DoneReason, Metrics: api.Metrics{ - PromptEvalCount: r.PromptEvalCount, - PromptEvalDuration: r.PromptEvalDuration, - EvalCount: r.EvalCount, - EvalDuration: r.EvalDuration, + PromptEvalCount: cr.PromptEvalCount, + PromptEvalDuration: cr.PromptEvalDuration, + EvalCount: cr.EvalCount, + EvalDuration: cr.EvalDuration, }, } + + if _, err := sb.WriteString(cr.Content); err != nil { + ch <- gin.H{"error": err.Error()} + } + + if cr.Done { + res.TotalDuration = time.Since(checkpointStart) + res.LoadDuration = checkpointLoaded.Sub(checkpointStart) + + if !req.Raw { + tokens, err := r.Tokenize(c.Request.Context(), prompt+sb.String()) + if err != nil { + ch <- gin.H{"error": err.Error()} + return + } + res.Context = append(req.Context, tokens...) 
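+ // res.Context is the request context plus the tokenized prompt and response, so callers can pass it back to continue the conversation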
+ } + } + + ch <- res }); err != nil { ch <- gin.H{"error": err.Error()} } @@ -246,6 +283,127 @@ func (s *Server) GenerateHandler(c *gin.Context) { streamResponse(c, ch) } +func (s *Server) EmbedHandler(c *gin.Context) { + checkpointStart := time.Now() + var req api.EmbedRequest + err := c.ShouldBindJSON(&req) + switch { + case errors.Is(err, io.EOF): + c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "missing request body"}) + return + case err != nil: + c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()}) + return + } + + truncate := true + + if req.Truncate != nil && !*req.Truncate { + truncate = false + } + + var input []string + + switch i := req.Input.(type) { + case string: + if len(i) > 0 { + input = append(input, i) + } + case []any: + for _, v := range i { + if _, ok := v.(string); !ok { + c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "invalid input type"}) + return + } + input = append(input, v.(string)) + } + default: + c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "invalid input type"}) + return + } + + if len(input) == 0 { + c.JSON(http.StatusOK, api.EmbedResponse{Model: req.Model, Embeddings: [][]float32{}}) + return + } + + r, m, opts, err := s.scheduleRunner(c.Request.Context(), req.Model, []Capability{}, req.Options, req.KeepAlive) + if err != nil { + handleScheduleError(c, req.Model, err) + return + } + + checkpointLoaded := time.Now() + + kvData, err := getKVData(m.ModelPath, false) + if err != nil { + c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) + return + } + + for i, s := range input { + tokens, err := r.Tokenize(c.Request.Context(), s) + if err != nil { + c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) + return + } + + ctxLen := min(opts.NumCtx, int(kvData.ContextLength())) + if len(tokens) > ctxLen { + if !truncate { + c.JSON(http.StatusBadRequest, gin.H{"error": "input length exceeds maximum context length"}) + return + } + + tokens = tokens[:ctxLen] + s, err = r.Detokenize(c.Request.Context(), tokens) + if err != nil { + c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) + return + } + } + + input[i] = s + } + embeddings, err := r.Embed(c.Request.Context(), input) + + if err != nil { + slog.Error("embedding generation failed", "error", err) + c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to generate embedding"}) + return + } + + for i, e := range embeddings.Embedding { + embeddings.Embedding[i] = normalize(e) + } + + resp := api.EmbedResponse{ + Model: req.Model, + Embeddings: embeddings.Embedding, + TotalDuration: time.Since(checkpointStart), + LoadDuration: checkpointLoaded.Sub(checkpointStart), + PromptEvalCount: embeddings.PromptEvalCount, + } + c.JSON(http.StatusOK, resp) +} + +func normalize(vec []float32) []float32 { + var sum float32 + for _, v := range vec { + sum += v * v + } + + norm := float32(0.0) + if sum > 0 { + norm = float32(1.0 / math.Sqrt(float64(sum))) + } + + for i := range vec { + vec[i] *= norm + } + return vec +} + func (s *Server) EmbeddingsHandler(c *gin.Context) { var req api.EmbeddingRequest if err := c.ShouldBindJSON(&req); errors.Is(err, io.EOF) { @@ -268,14 +426,24 @@ func (s *Server) EmbeddingsHandler(c *gin.Context) { return } - embedding, err := r.Embedding(c.Request.Context(), req.Prompt) + embeddings, err := r.Embed(c.Request.Context(), []string{req.Prompt}) + if err != nil { slog.Info(fmt.Sprintf("embedding generation failed: %v", err)) c.JSON(http.StatusInternalServerError, gin.H{"error": 
"failed to generate embedding"}) return } - c.JSON(http.StatusOK, api.EmbeddingResponse{Embedding: embedding}) + embedding := make([]float64, len(embeddings.Embedding[0])) + + for i, v := range embeddings.Embedding[0] { + embedding[i] = float64(v) + } + + resp := api.EmbeddingResponse{ + Embedding: embedding, + } + c.JSON(http.StatusOK, resp) } func (s *Server) PullModelHandler(c *gin.Context) { @@ -447,7 +615,9 @@ func (s *Server) CreateModelHandler(c *gin.Context) { defer cancel() quantization := cmp.Or(r.Quantize, r.Quantization) - if err := CreateModel(ctx, name, filepath.Dir(r.Path), strings.ToUpper(quantization), f, fn); err != nil { + if err := CreateModel(ctx, name, filepath.Dir(r.Path), strings.ToUpper(quantization), f, fn); errors.Is(err, errBadTemplate) { + ch <- gin.H{"error": err.Error(), "status": http.StatusBadRequest} + } else if err != nil { ch <- gin.H{"error": err.Error()} } }() @@ -549,13 +719,6 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) { m.System = req.System } - if req.Template != "" { - m.Template, err = template.Parse(req.Template) - if err != nil { - return nil, err - } - } - msgs := make([]api.Message, len(m.Messages)) for i, msg := range m.Messages { msgs[i] = api.Message{Role: msg.Role, Content: msg.Content} @@ -901,6 +1064,7 @@ func (s *Server) GenerateRoutes() http.Handler { r.POST("/api/pull", s.PullModelHandler) r.POST("/api/generate", s.GenerateHandler) r.POST("/api/chat", s.ChatHandler) + r.POST("/api/embed", s.EmbedHandler) r.POST("/api/embeddings", s.EmbeddingsHandler) r.POST("/api/create", s.CreateModelHandler) r.POST("/api/push", s.PushModelHandler) @@ -914,6 +1078,7 @@ func (s *Server) GenerateRoutes() http.Handler { // Compatibility endpoints r.POST("/v1/chat/completions", openai.ChatMiddleware(), s.ChatHandler) r.POST("/v1/completions", openai.CompletionsMiddleware(), s.GenerateHandler) + r.POST("/v1/embeddings", openai.EmbeddingsMiddleware(), s.EmbedHandler) r.GET("/v1/models", openai.ListMiddleware(), s.ListModelsHandler) r.GET("/v1/models/:model", openai.RetrieveMiddleware(), s.ShowModelHandler) @@ -1040,11 +1205,15 @@ func waitForStream(c *gin.Context, ch chan interface{}) { return } case gin.H: + status, ok := r["status"].(int) + if !ok { + status = http.StatusInternalServerError + } if errorMsg, ok := r["error"].(string); ok { - c.JSON(http.StatusInternalServerError, gin.H{"error": errorMsg}) + c.JSON(status, gin.H{"error": errorMsg}) return } else { - c.JSON(http.StatusInternalServerError, gin.H{"error": "unexpected error format in progress response"}) + c.JSON(status, gin.H{"error": "unexpected error format in progress response"}) return } default: @@ -1122,6 +1291,8 @@ func (s *Server) ProcessHandler(c *gin.Context) { } func (s *Server) ChatHandler(c *gin.Context) { + checkpointStart := time.Now() + var req api.ChatRequest if err := c.ShouldBindJSON(&req); errors.Is(err, io.EOF) { c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "missing request body"}) @@ -1132,6 +1303,10 @@ func (s *Server) ChatHandler(c *gin.Context) { } caps := []Capability{CapabilityCompletion} + if len(req.Tools) > 0 { + caps = append(caps, CapabilityTools) + } + r, m, opts, err := s.scheduleRunner(c.Request.Context(), req.Model, caps, req.Options, req.KeepAlive) if errors.Is(err, errCapabilityCompletion) { c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("%q does not support chat", req.Model)}) @@ -1141,6 +1316,8 @@ func (s *Server) ChatHandler(c *gin.Context) { return } + checkpointLoaded := time.Now() + if 
len(req.Messages) == 0 { c.JSON(http.StatusOK, api.ChatResponse{ Model: req.Model, @@ -1152,7 +1329,11 @@ func (s *Server) ChatHandler(c *gin.Context) { return } - prompt, images, err := chatPrompt(c.Request.Context(), m, r.Tokenize, opts, req.Messages) + if req.Messages[0].Role != "system" && m.System != "" { + req.Messages = append([]api.Message{{Role: "system", Content: m.System}}, req.Messages...) + } + + prompt, images, err := chatPrompt(c.Request.Context(), m, r.Tokenize, opts, req.Messages, req.Tools) if err != nil { c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) return @@ -1169,7 +1350,7 @@ func (s *Server) ChatHandler(c *gin.Context) { Format: req.Format, Options: opts, }, func(r llm.CompletionResponse) { - ch <- api.ChatResponse{ + res := api.ChatResponse{ Model: req.Model, CreatedAt: time.Now().UTC(), Message: api.Message{Role: "assistant", Content: r.Content}, @@ -1182,19 +1363,26 @@ func (s *Server) ChatHandler(c *gin.Context) { EvalDuration: r.EvalDuration, }, } + + if r.Done { + res.TotalDuration = time.Since(checkpointStart) + res.LoadDuration = checkpointLoaded.Sub(checkpointStart) + } + + ch <- res }); err != nil { ch <- gin.H{"error": err.Error()} } }() if req.Stream != nil && !*req.Stream { - var r api.ChatResponse + var resp api.ChatResponse var sb strings.Builder for rr := range ch { switch t := rr.(type) { case api.ChatResponse: sb.WriteString(t.Message.Content) - r = t + resp = t case gin.H: msg, ok := t["error"].(string) if !ok { @@ -1209,8 +1397,16 @@ func (s *Server) ChatHandler(c *gin.Context) { } } - r.Message.Content = sb.String() - c.JSON(http.StatusOK, r) + resp.Message.Content = sb.String() + + if len(req.Tools) > 0 { + if toolCalls, ok := m.parseToolCalls(sb.String()); ok { + resp.Message.ToolCalls = toolCalls + resp.Message.Content = "" + } + } + + c.JSON(http.StatusOK, resp) return } @@ -1219,7 +1415,7 @@ func (s *Server) ChatHandler(c *gin.Context) { func handleScheduleError(c *gin.Context, name string, err error) { switch { - case errors.Is(err, errRequired): + case errors.Is(err, errCapabilities), errors.Is(err, errRequired): c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()}) case errors.Is(err, context.Canceled): c.JSON(499, gin.H{"error": "request canceled"}) diff --git a/server/routes_create_test.go b/server/routes_create_test.go index 04174b92..e801a74f 100644 --- a/server/routes_create_test.go +++ b/server/routes_create_test.go @@ -85,6 +85,8 @@ func checkFileExists(t *testing.T, p string, expect []string) { } func TestCreateFromBin(t *testing.T) { + gin.SetMode(gin.TestMode) + p := t.TempDir() t.Setenv("OLLAMA_MODELS", p) envconfig.LoadConfig() @@ -111,6 +113,8 @@ func TestCreateFromBin(t *testing.T) { } func TestCreateFromModel(t *testing.T) { + gin.SetMode(gin.TestMode) + p := t.TempDir() t.Setenv("OLLAMA_MODELS", p) envconfig.LoadConfig() @@ -152,6 +156,8 @@ func TestCreateFromModel(t *testing.T) { } func TestCreateRemovesLayers(t *testing.T) { + gin.SetMode(gin.TestMode) + p := t.TempDir() t.Setenv("OLLAMA_MODELS", p) envconfig.LoadConfig() @@ -199,6 +205,8 @@ func TestCreateRemovesLayers(t *testing.T) { } func TestCreateUnsetsSystem(t *testing.T) { + gin.SetMode(gin.TestMode) + p := t.TempDir() t.Setenv("OLLAMA_MODELS", p) envconfig.LoadConfig() @@ -255,6 +263,8 @@ func TestCreateUnsetsSystem(t *testing.T) { } func TestCreateMergeParameters(t *testing.T) { + gin.SetMode(gin.TestMode) + p := t.TempDir() t.Setenv("OLLAMA_MODELS", p) envconfig.LoadConfig() @@ -358,6 +368,8 @@ func TestCreateMergeParameters(t 
*testing.T) { } func TestCreateReplacesMessages(t *testing.T) { + gin.SetMode(gin.TestMode) + p := t.TempDir() t.Setenv("OLLAMA_MODELS", p) envconfig.LoadConfig() @@ -434,6 +446,8 @@ func TestCreateReplacesMessages(t *testing.T) { } func TestCreateTemplateSystem(t *testing.T) { + gin.SetMode(gin.TestMode) + p := t.TempDir() t.Setenv("OLLAMA_MODELS", p) envconfig.LoadConfig() @@ -477,9 +491,47 @@ func TestCreateTemplateSystem(t *testing.T) { if string(system) != "Say bye!" { t.Errorf("expected \"Say bye!\", actual %s", system) } + + t.Run("incomplete template", func(t *testing.T) { + w := createRequest(t, s.CreateModelHandler, api.CreateRequest{ + Name: "test", + Modelfile: fmt.Sprintf("FROM %s\nTEMPLATE {{ .Prompt", createBinFile(t, nil, nil)), + Stream: &stream, + }) + + if w.Code != http.StatusBadRequest { + t.Fatalf("expected status code 400, actual %d", w.Code) + } + }) + + t.Run("template with unclosed if", func(t *testing.T) { + w := createRequest(t, s.CreateModelHandler, api.CreateRequest{ + Name: "test", + Modelfile: fmt.Sprintf("FROM %s\nTEMPLATE {{ if .Prompt }}", createBinFile(t, nil, nil)), + Stream: &stream, + }) + + if w.Code != http.StatusBadRequest { + t.Fatalf("expected status code 400, actual %d", w.Code) + } + }) + + t.Run("template with undefined function", func(t *testing.T) { + w := createRequest(t, s.CreateModelHandler, api.CreateRequest{ + Name: "test", + Modelfile: fmt.Sprintf("FROM %s\nTEMPLATE {{ Prompt }}", createBinFile(t, nil, nil)), + Stream: &stream, + }) + + if w.Code != http.StatusBadRequest { + t.Fatalf("expected status code 400, actual %d", w.Code) + } + }) } func TestCreateLicenses(t *testing.T) { + gin.SetMode(gin.TestMode) + p := t.TempDir() t.Setenv("OLLAMA_MODELS", p) envconfig.LoadConfig() @@ -526,6 +578,8 @@ func TestCreateLicenses(t *testing.T) { } func TestCreateDetectTemplate(t *testing.T) { + gin.SetMode(gin.TestMode) + p := t.TempDir() t.Setenv("OLLAMA_MODELS", p) envconfig.LoadConfig() @@ -545,9 +599,10 @@ func TestCreateDetectTemplate(t *testing.T) { } checkFileExists(t, filepath.Join(p, "blobs", "*"), []string{ + filepath.Join(p, "blobs", "sha256-0d79f567714c62c048378f2107fb332dabee0135d080c302d884317da9433cc5"), filepath.Join(p, "blobs", "sha256-553c4a3f747b3d22a4946875f1cc8ed011c2930d83f864a0c7265f9ec0a20413"), filepath.Join(p, "blobs", "sha256-c608dc615584cd20d9d830363dabf8a4783ae5d34245c3d8c115edb3bc7b28e4"), - filepath.Join(p, "blobs", "sha256-f836ee110db21567f826332e4cedd746c06d10664fd5a9ea3659e3683a944510"), + filepath.Join(p, "blobs", "sha256-ea34c57ba5b78b740aafe2aeb74dc6507fc3ad14170b64c26a04fb9e36c88d75"), }) }) diff --git a/server/routes_delete_test.go b/server/routes_delete_test.go index 00303bd1..33a97a73 100644 --- a/server/routes_delete_test.go +++ b/server/routes_delete_test.go @@ -8,12 +8,15 @@ import ( "path/filepath" "testing" + "github.com/gin-gonic/gin" "github.com/ollama/ollama/api" "github.com/ollama/ollama/envconfig" "github.com/ollama/ollama/types/model" ) func TestDelete(t *testing.T) { + gin.SetMode(gin.TestMode) + p := t.TempDir() t.Setenv("OLLAMA_MODELS", p) envconfig.LoadConfig() @@ -77,6 +80,8 @@ func TestDelete(t *testing.T) { } func TestDeleteDuplicateLayers(t *testing.T) { + gin.SetMode(gin.TestMode) + p := t.TempDir() t.Setenv("OLLAMA_MODELS", p) var s Server diff --git a/server/routes_generate_test.go b/server/routes_generate_test.go new file mode 100644 index 00000000..5c0caff1 --- /dev/null +++ b/server/routes_generate_test.go @@ -0,0 +1,714 @@ +package server + +import ( + "bytes" + "context" + 
"encoding/json" + "fmt" + "io" + "net/http" + "strings" + "testing" + "time" + + "github.com/gin-gonic/gin" + "github.com/google/go-cmp/cmp" + + "github.com/ollama/ollama/api" + "github.com/ollama/ollama/gpu" + "github.com/ollama/ollama/llm" +) + +type mockRunner struct { + llm.LlamaServer + + // CompletionRequest is only valid until the next call to Completion + llm.CompletionRequest + llm.CompletionResponse +} + +func (m *mockRunner) Completion(_ context.Context, r llm.CompletionRequest, fn func(r llm.CompletionResponse)) error { + m.CompletionRequest = r + fn(m.CompletionResponse) + return nil +} + +func (mockRunner) Tokenize(_ context.Context, s string) (tokens []int, err error) { + for range strings.Fields(s) { + tokens = append(tokens, len(tokens)) + } + + return +} + +func newMockServer(mock *mockRunner) func(gpu.GpuInfoList, string, *llm.GGML, []string, []string, api.Options, int) (llm.LlamaServer, error) { + return func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, projectors, system []string, opts api.Options, numParallel int) (llm.LlamaServer, error) { + return mock, nil + } +} + +func TestGenerateChat(t *testing.T) { + gin.SetMode(gin.TestMode) + + mock := mockRunner{ + CompletionResponse: llm.CompletionResponse{ + Done: true, + DoneReason: "stop", + PromptEvalCount: 1, + PromptEvalDuration: 1, + EvalCount: 1, + EvalDuration: 1, + }, + } + + s := Server{ + sched: &Scheduler{ + pendingReqCh: make(chan *LlmRequest, 1), + finishedReqCh: make(chan *LlmRequest, 1), + expiredCh: make(chan *runnerRef, 1), + unloadedCh: make(chan any, 1), + loaded: make(map[string]*runnerRef), + newServerFn: newMockServer(&mock), + getGpuFn: gpu.GetGPUInfo, + getCpuFn: gpu.GetCPUInfo, + reschedDelay: 250 * time.Millisecond, + loadFn: func(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel int) { + // add small delay to simulate loading + time.Sleep(time.Millisecond) + req.successCh <- &runnerRef{ + llama: &mock, + } + }, + }, + } + + go s.sched.Run(context.TODO()) + + w := createRequest(t, s.CreateModelHandler, api.CreateRequest{ + Model: "test", + Modelfile: fmt.Sprintf(`FROM %s + TEMPLATE """ +{{- if .System }}System: {{ .System }} {{ end }} +{{- if .Prompt }}User: {{ .Prompt }} {{ end }} +{{- if .Response }}Assistant: {{ .Response }} {{ end }}""" +`, createBinFile(t, llm.KV{ + "general.architecture": "llama", + "llama.block_count": uint32(1), + "llama.context_length": uint32(8192), + "llama.embedding_length": uint32(4096), + "llama.attention.head_count": uint32(32), + "llama.attention.head_count_kv": uint32(8), + "tokenizer.ggml.tokens": []string{""}, + "tokenizer.ggml.scores": []float32{0}, + "tokenizer.ggml.token_type": []int32{0}, + }, []llm.Tensor{ + {Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))}, + {Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))}, + {Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))}, + {Name: "blk.0.ffn_gate.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))}, + {Name: "blk.0.ffn_up.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))}, + {Name: "blk.0.ffn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))}, + {Name: "blk.0.attn_k.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))}, + {Name: "blk.0.attn_output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))}, + {Name: "blk.0.attn_q.weight", Shape: []uint64{1}, 
WriterTo: bytes.NewReader(make([]byte, 4))}, + {Name: "blk.0.attn_v.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))}, + {Name: "output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))}, + })), + Stream: &stream, + }) + + if w.Code != http.StatusOK { + t.Fatalf("expected status 200, got %d", w.Code) + } + + t.Run("missing body", func(t *testing.T) { + w := createRequest(t, s.ChatHandler, nil) + if w.Code != http.StatusBadRequest { + t.Errorf("expected status 400, got %d", w.Code) + } + + if diff := cmp.Diff(w.Body.String(), `{"error":"model is required"}`); diff != "" { + t.Errorf("mismatch (-got +want):\n%s", diff) + } + }) + + t.Run("missing model", func(t *testing.T) { + w := createRequest(t, s.ChatHandler, api.ChatRequest{}) + if w.Code != http.StatusBadRequest { + t.Errorf("expected status 400, got %d", w.Code) + } + + if diff := cmp.Diff(w.Body.String(), `{"error":"model is required"}`); diff != "" { + t.Errorf("mismatch (-got +want):\n%s", diff) + } + }) + + t.Run("missing capabilities chat", func(t *testing.T) { + w := createRequest(t, s.CreateModelHandler, api.CreateRequest{ + Model: "bert", + Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, llm.KV{ + "general.architecture": "bert", + "bert.pooling_type": uint32(0), + }, []llm.Tensor{})), + Stream: &stream, + }) + + if w.Code != http.StatusOK { + t.Fatalf("expected status 200, got %d", w.Code) + } + + w = createRequest(t, s.ChatHandler, api.ChatRequest{ + Model: "bert", + }) + + if w.Code != http.StatusBadRequest { + t.Errorf("expected status 400, got %d", w.Code) + } + + if diff := cmp.Diff(w.Body.String(), `{"error":"\"bert\" does not support chat"}`); diff != "" { + t.Errorf("mismatch (-got +want):\n%s", diff) + } + }) + + t.Run("load model", func(t *testing.T) { + w := createRequest(t, s.ChatHandler, api.ChatRequest{ + Model: "test", + }) + + if w.Code != http.StatusOK { + t.Errorf("expected status 200, got %d", w.Code) + } + + var actual api.ChatResponse + if err := json.NewDecoder(w.Body).Decode(&actual); err != nil { + t.Fatal(err) + } + + if actual.Model != "test" { + t.Errorf("expected model test, got %s", actual.Model) + } + + if !actual.Done { + t.Errorf("expected done true, got false") + } + + if actual.DoneReason != "load" { + t.Errorf("expected done reason load, got %s", actual.DoneReason) + } + }) + + checkChatResponse := func(t *testing.T, body io.Reader, model, content string) { + t.Helper() + + var actual api.ChatResponse + if err := json.NewDecoder(body).Decode(&actual); err != nil { + t.Fatal(err) + } + + if actual.Model != model { + t.Errorf("expected model test, got %s", actual.Model) + } + + if !actual.Done { + t.Errorf("expected done false, got true") + } + + if actual.DoneReason != "stop" { + t.Errorf("expected done reason stop, got %s", actual.DoneReason) + } + + if diff := cmp.Diff(actual.Message, api.Message{ + Role: "assistant", + Content: content, + }); diff != "" { + t.Errorf("mismatch (-got +want):\n%s", diff) + } + + if actual.PromptEvalCount == 0 { + t.Errorf("expected prompt eval count > 0, got 0") + } + + if actual.PromptEvalDuration == 0 { + t.Errorf("expected prompt eval duration > 0, got 0") + } + + if actual.EvalCount == 0 { + t.Errorf("expected eval count > 0, got 0") + } + + if actual.EvalDuration == 0 { + t.Errorf("expected eval duration > 0, got 0") + } + + if actual.LoadDuration == 0 { + t.Errorf("expected load duration > 0, got 0") + } + + if actual.TotalDuration == 0 { + t.Errorf("expected total duration > 0, got 0") + } + } + + 
mock.CompletionResponse.Content = "Hi!" + t.Run("messages", func(t *testing.T) { + w := createRequest(t, s.ChatHandler, api.ChatRequest{ + Model: "test", + Messages: []api.Message{ + {Role: "user", Content: "Hello!"}, + }, + Stream: &stream, + }) + + if w.Code != http.StatusOK { + t.Errorf("expected status 200, got %d", w.Code) + } + + if diff := cmp.Diff(mock.CompletionRequest.Prompt, "User: Hello! "); diff != "" { + t.Errorf("mismatch (-got +want):\n%s", diff) + } + + checkChatResponse(t, w.Body, "test", "Hi!") + }) + + w = createRequest(t, s.CreateModelHandler, api.CreateRequest{ + Model: "test-system", + Modelfile: "FROM test\nSYSTEM You are a helpful assistant.", + }) + + if w.Code != http.StatusOK { + t.Fatalf("expected status 200, got %d", w.Code) + } + + t.Run("messages with model system", func(t *testing.T) { + w := createRequest(t, s.ChatHandler, api.ChatRequest{ + Model: "test-system", + Messages: []api.Message{ + {Role: "user", Content: "Hello!"}, + }, + Stream: &stream, + }) + + if w.Code != http.StatusOK { + t.Errorf("expected status 200, got %d", w.Code) + } + + if diff := cmp.Diff(mock.CompletionRequest.Prompt, "System: You are a helpful assistant. User: Hello! "); diff != "" { + t.Errorf("mismatch (-got +want):\n%s", diff) + } + + checkChatResponse(t, w.Body, "test-system", "Hi!") + }) + + mock.CompletionResponse.Content = "Abra kadabra!" + t.Run("messages with system", func(t *testing.T) { + w := createRequest(t, s.ChatHandler, api.ChatRequest{ + Model: "test-system", + Messages: []api.Message{ + {Role: "system", Content: "You can perform magic tricks."}, + {Role: "user", Content: "Hello!"}, + }, + Stream: &stream, + }) + + if w.Code != http.StatusOK { + t.Errorf("expected status 200, got %d", w.Code) + } + + if diff := cmp.Diff(mock.CompletionRequest.Prompt, "System: You can perform magic tricks. User: Hello! "); diff != "" { + t.Errorf("mismatch (-got +want):\n%s", diff) + } + + checkChatResponse(t, w.Body, "test-system", "Abra kadabra!") + }) + + t.Run("messages with interleaved system", func(t *testing.T) { + w := createRequest(t, s.ChatHandler, api.ChatRequest{ + Model: "test-system", + Messages: []api.Message{ + {Role: "user", Content: "Hello!"}, + {Role: "assistant", Content: "I can help you with that."}, + {Role: "system", Content: "You can perform magic tricks."}, + {Role: "user", Content: "Help me write tests."}, + }, + Stream: &stream, + }) + + if w.Code != http.StatusOK { + t.Errorf("expected status 200, got %d", w.Code) + } + + if diff := cmp.Diff(mock.CompletionRequest.Prompt, "System: You are a helpful assistant. User: Hello! Assistant: I can help you with that. System: You can perform magic tricks. User: Help me write tests. 
"); diff != "" { + t.Errorf("mismatch (-got +want):\n%s", diff) + } + + checkChatResponse(t, w.Body, "test-system", "Abra kadabra!") + }) +} + +func TestGenerate(t *testing.T) { + gin.SetMode(gin.TestMode) + + mock := mockRunner{ + CompletionResponse: llm.CompletionResponse{ + Done: true, + DoneReason: "stop", + PromptEvalCount: 1, + PromptEvalDuration: 1, + EvalCount: 1, + EvalDuration: 1, + }, + } + + s := Server{ + sched: &Scheduler{ + pendingReqCh: make(chan *LlmRequest, 1), + finishedReqCh: make(chan *LlmRequest, 1), + expiredCh: make(chan *runnerRef, 1), + unloadedCh: make(chan any, 1), + loaded: make(map[string]*runnerRef), + newServerFn: newMockServer(&mock), + getGpuFn: gpu.GetGPUInfo, + getCpuFn: gpu.GetCPUInfo, + reschedDelay: 250 * time.Millisecond, + loadFn: func(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel int) { + // add small delay to simulate loading + time.Sleep(time.Millisecond) + req.successCh <- &runnerRef{ + llama: &mock, + } + }, + }, + } + + go s.sched.Run(context.TODO()) + + w := createRequest(t, s.CreateModelHandler, api.CreateRequest{ + Model: "test", + Modelfile: fmt.Sprintf(`FROM %s + TEMPLATE """ +{{- if .System }}System: {{ .System }} {{ end }} +{{- if .Prompt }}User: {{ .Prompt }} {{ end }} +{{- if .Response }}Assistant: {{ .Response }} {{ end }}""" +`, createBinFile(t, llm.KV{ + "general.architecture": "llama", + "llama.block_count": uint32(1), + "llama.context_length": uint32(8192), + "llama.embedding_length": uint32(4096), + "llama.attention.head_count": uint32(32), + "llama.attention.head_count_kv": uint32(8), + "tokenizer.ggml.tokens": []string{""}, + "tokenizer.ggml.scores": []float32{0}, + "tokenizer.ggml.token_type": []int32{0}, + }, []llm.Tensor{ + {Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))}, + {Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))}, + {Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))}, + {Name: "blk.0.ffn_gate.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))}, + {Name: "blk.0.ffn_up.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))}, + {Name: "blk.0.ffn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))}, + {Name: "blk.0.attn_k.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))}, + {Name: "blk.0.attn_output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))}, + {Name: "blk.0.attn_q.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))}, + {Name: "blk.0.attn_v.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))}, + {Name: "output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))}, + })), + Stream: &stream, + }) + + if w.Code != http.StatusOK { + t.Fatalf("expected status 200, got %d", w.Code) + } + + t.Run("missing body", func(t *testing.T) { + w := createRequest(t, s.GenerateHandler, nil) + if w.Code != http.StatusBadRequest { + t.Errorf("expected status 400, got %d", w.Code) + } + + if diff := cmp.Diff(w.Body.String(), `{"error":"model is required"}`); diff != "" { + t.Errorf("mismatch (-got +want):\n%s", diff) + } + }) + + t.Run("missing model", func(t *testing.T) { + w := createRequest(t, s.GenerateHandler, api.GenerateRequest{}) + if w.Code != http.StatusBadRequest { + t.Errorf("expected status 400, got %d", w.Code) + } + + if diff := cmp.Diff(w.Body.String(), `{"error":"model is required"}`); diff != "" 
{ + t.Errorf("mismatch (-got +want):\n%s", diff) + } + }) + + t.Run("missing capabilities generate", func(t *testing.T) { + w := createRequest(t, s.CreateModelHandler, api.CreateRequest{ + Model: "bert", + Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, llm.KV{ + "general.architecture": "bert", + "bert.pooling_type": uint32(0), + }, []llm.Tensor{})), + Stream: &stream, + }) + + if w.Code != http.StatusOK { + t.Fatalf("expected status 200, got %d", w.Code) + } + + w = createRequest(t, s.GenerateHandler, api.GenerateRequest{ + Model: "bert", + }) + + if w.Code != http.StatusBadRequest { + t.Errorf("expected status 400, got %d", w.Code) + } + + if diff := cmp.Diff(w.Body.String(), `{"error":"\"bert\" does not support generate"}`); diff != "" { + t.Errorf("mismatch (-got +want):\n%s", diff) + } + }) + + t.Run("missing capabilities suffix", func(t *testing.T) { + w := createRequest(t, s.GenerateHandler, api.GenerateRequest{ + Model: "test", + Prompt: "def add(", + Suffix: " return c", + }) + + if w.Code != http.StatusBadRequest { + t.Errorf("expected status 400, got %d", w.Code) + } + + if diff := cmp.Diff(w.Body.String(), `{"error":"test does not support insert"}`); diff != "" { + t.Errorf("mismatch (-got +want):\n%s", diff) + } + }) + + t.Run("load model", func(t *testing.T) { + w := createRequest(t, s.GenerateHandler, api.GenerateRequest{ + Model: "test", + }) + + if w.Code != http.StatusOK { + t.Errorf("expected status 200, got %d", w.Code) + } + + var actual api.GenerateResponse + if err := json.NewDecoder(w.Body).Decode(&actual); err != nil { + t.Fatal(err) + } + + if actual.Model != "test" { + t.Errorf("expected model test, got %s", actual.Model) + } + + if !actual.Done { + t.Errorf("expected done true, got false") + } + + if actual.DoneReason != "load" { + t.Errorf("expected done reason load, got %s", actual.DoneReason) + } + }) + + checkGenerateResponse := func(t *testing.T, body io.Reader, model, content string) { + t.Helper() + + var actual api.GenerateResponse + if err := json.NewDecoder(body).Decode(&actual); err != nil { + t.Fatal(err) + } + + if actual.Model != model { + t.Errorf("expected model test, got %s", actual.Model) + } + + if !actual.Done { + t.Errorf("expected done false, got true") + } + + if actual.DoneReason != "stop" { + t.Errorf("expected done reason stop, got %s", actual.DoneReason) + } + + if actual.Response != content { + t.Errorf("expected response %s, got %s", content, actual.Response) + } + + if actual.Context == nil { + t.Errorf("expected context not nil") + } + + if actual.PromptEvalCount == 0 { + t.Errorf("expected prompt eval count > 0, got 0") + } + + if actual.PromptEvalDuration == 0 { + t.Errorf("expected prompt eval duration > 0, got 0") + } + + if actual.EvalCount == 0 { + t.Errorf("expected eval count > 0, got 0") + } + + if actual.EvalDuration == 0 { + t.Errorf("expected eval duration > 0, got 0") + } + + if actual.LoadDuration == 0 { + t.Errorf("expected load duration > 0, got 0") + } + + if actual.TotalDuration == 0 { + t.Errorf("expected total duration > 0, got 0") + } + } + + mock.CompletionResponse.Content = "Hi!" + t.Run("prompt", func(t *testing.T) { + w := createRequest(t, s.GenerateHandler, api.GenerateRequest{ + Model: "test", + Prompt: "Hello!", + Stream: &stream, + }) + + if w.Code != http.StatusOK { + t.Errorf("expected status 200, got %d", w.Code) + } + + if diff := cmp.Diff(mock.CompletionRequest.Prompt, "User: Hello! 
"); diff != "" { + t.Errorf("mismatch (-got +want):\n%s", diff) + } + + checkGenerateResponse(t, w.Body, "test", "Hi!") + }) + + w = createRequest(t, s.CreateModelHandler, api.CreateRequest{ + Model: "test-system", + Modelfile: "FROM test\nSYSTEM You are a helpful assistant.", + }) + + if w.Code != http.StatusOK { + t.Fatalf("expected status 200, got %d", w.Code) + } + + t.Run("prompt with model system", func(t *testing.T) { + w := createRequest(t, s.GenerateHandler, api.GenerateRequest{ + Model: "test-system", + Prompt: "Hello!", + Stream: &stream, + }) + + if w.Code != http.StatusOK { + t.Errorf("expected status 200, got %d", w.Code) + } + + if diff := cmp.Diff(mock.CompletionRequest.Prompt, "System: You are a helpful assistant. User: Hello! "); diff != "" { + t.Errorf("mismatch (-got +want):\n%s", diff) + } + + checkGenerateResponse(t, w.Body, "test-system", "Hi!") + }) + + mock.CompletionResponse.Content = "Abra kadabra!" + t.Run("prompt with system", func(t *testing.T) { + w := createRequest(t, s.GenerateHandler, api.GenerateRequest{ + Model: "test-system", + Prompt: "Hello!", + System: "You can perform magic tricks.", + Stream: &stream, + }) + + if w.Code != http.StatusOK { + t.Errorf("expected status 200, got %d", w.Code) + } + + if diff := cmp.Diff(mock.CompletionRequest.Prompt, "System: You can perform magic tricks. User: Hello! "); diff != "" { + t.Errorf("mismatch (-got +want):\n%s", diff) + } + + checkGenerateResponse(t, w.Body, "test-system", "Abra kadabra!") + }) + + t.Run("prompt with template", func(t *testing.T) { + w := createRequest(t, s.GenerateHandler, api.GenerateRequest{ + Model: "test-system", + Prompt: "Help me write tests.", + System: "You can perform magic tricks.", + Template: `{{- if .System }}{{ .System }} {{ end }} +{{- if .Prompt }}### USER {{ .Prompt }} {{ end }} +{{- if .Response }}### ASSISTANT {{ .Response }} {{ end }}`, + Stream: &stream, + }) + + if w.Code != http.StatusOK { + t.Errorf("expected status 200, got %d", w.Code) + } + + if diff := cmp.Diff(mock.CompletionRequest.Prompt, "You can perform magic tricks. ### USER Help me write tests. "); diff != "" { + t.Errorf("mismatch (-got +want):\n%s", diff) + } + + checkGenerateResponse(t, w.Body, "test-system", "Abra kadabra!") + }) + + w = createRequest(t, s.CreateModelHandler, api.CreateRequest{ + Model: "test-suffix", + Modelfile: `FROM test +TEMPLATE """{{- if .Suffix }}
 {{ .Prompt }} {{ .Suffix }} 
+{{- else }}{{ .Prompt }}
+{{- end }}"""`,
+	})
+
+	if w.Code != http.StatusOK {
+		t.Fatalf("expected status 200, got %d", w.Code)
+	}
+
+	t.Run("prompt with suffix", func(t *testing.T) {
+		w := createRequest(t, s.GenerateHandler, api.GenerateRequest{
+			Model:  "test-suffix",
+			Prompt: "def add(",
+			Suffix: "    return c",
+		})
+
+		if w.Code != http.StatusOK {
+			t.Errorf("expected status 200, got %d", w.Code)
+		}
+
+		if diff := cmp.Diff(mock.CompletionRequest.Prompt, " def add(     return c "); diff != "" {
+			t.Errorf("mismatch (-got +want):\n%s", diff)
+		}
+	})
+
+	t.Run("prompt without suffix", func(t *testing.T) {
+		w := createRequest(t, s.GenerateHandler, api.GenerateRequest{
+			Model:  "test-suffix",
+			Prompt: "def add(",
+		})
+
+		if w.Code != http.StatusOK {
+			t.Errorf("expected status 200, got %d", w.Code)
+		}
+
+		if diff := cmp.Diff(mock.CompletionRequest.Prompt, "def add("); diff != "" {
+			t.Errorf("mismatch (-got +want):\n%s", diff)
+		}
+	})
+
+	t.Run("raw", func(t *testing.T) {
+		w := createRequest(t, s.GenerateHandler, api.GenerateRequest{
+			Model:  "test-system",
+			Prompt: "Help me write tests.",
+			Raw:    true,
+			Stream: &stream,
+		})
+
+		if w.Code != http.StatusOK {
+			t.Errorf("expected status 200, got %d", w.Code)
+		}
+
+		if diff := cmp.Diff(mock.CompletionRequest.Prompt, "Help me write tests."); diff != "" {
+			t.Errorf("mismatch (-got +want):\n%s", diff)
+		}
+	})
+}
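
The generate/chat tests above lean on the package's createRequest helper, which is defined in another test file and not part of this diff. A minimal sketch of how such a helper can look, assuming it simply encodes the body and runs a single gin handler against an in-memory recorder; the real implementation may differ:

```go
// Hypothetical sketch of a createRequest-style helper: encode the request
// body, drive one gin handler in test mode, and return the recorder so the
// caller can assert on status code and response body.
package server

import (
	"bytes"
	"encoding/json"
	"net/http"
	"net/http/httptest"
	"testing"

	"github.com/gin-gonic/gin"
)

func createRequestSketch(t *testing.T, fn func(*gin.Context), body any) *httptest.ResponseRecorder {
	t.Helper()

	var buf bytes.Buffer
	if body != nil {
		if err := json.NewEncoder(&buf).Encode(body); err != nil {
			t.Fatal(err)
		}
	}

	w := httptest.NewRecorder()
	c, _ := gin.CreateTestContext(w)
	c.Request = httptest.NewRequest(http.MethodPost, "/", &buf)
	c.Request.Header.Set("Content-Type", "application/json")

	// Invoke the handler directly; the recorder captures status and body.
	fn(c)
	return w
}
```
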
diff --git a/server/routes_list_test.go b/server/routes_list_test.go
index d04be9d6..c2d9c113 100644
--- a/server/routes_list_test.go
+++ b/server/routes_list_test.go
@@ -7,11 +7,14 @@ import (
 	"slices"
 	"testing"
 
+	"github.com/gin-gonic/gin"
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/envconfig"
 )
 
 func TestList(t *testing.T) {
+	gin.SetMode(gin.TestMode)
+
 	t.Setenv("OLLAMA_MODELS", t.TempDir())
 	envconfig.LoadConfig()
 
diff --git a/server/routes_test.go b/server/routes_test.go
index 50eaf7e9..97786ba2 100644
--- a/server/routes_test.go
+++ b/server/routes_test.go
@@ -7,6 +7,7 @@ import (
 	"encoding/json"
 	"fmt"
 	"io"
+	"math"
 	"net/http"
 	"net/http/httptest"
 	"os"
@@ -272,6 +273,77 @@ func Test_Routes(t *testing.T) {
 				assert.Equal(t, "library", retrieveResp.OwnedBy)
 			},
 		},
+		{
+			Name:   "Embed Handler Empty Input",
+			Method: http.MethodPost,
+			Path:   "/api/embed",
+			Setup: func(t *testing.T, req *http.Request) {
+				embedReq := api.EmbedRequest{
+					Model: "t-bone",
+					Input: "",
+				}
+				jsonData, err := json.Marshal(embedReq)
+				require.NoError(t, err)
+				req.Body = io.NopCloser(bytes.NewReader(jsonData))
+			},
+			Expected: func(t *testing.T, resp *http.Response) {
+				contentType := resp.Header.Get("Content-Type")
+				if contentType != "application/json; charset=utf-8" {
+					t.Fatalf("expected content type application/json; charset=utf-8, got %s", contentType)
+				}
+				body, err := io.ReadAll(resp.Body)
+				if err != nil {
+					t.Fatal(err)
+				}
+
+				var embedResp api.EmbedResponse
+				err = json.Unmarshal(body, &embedResp)
+				if err != nil {
+					t.Fatal(err)
+				}
+
+				if embedResp.Model != "t-bone" {
+					t.Fatalf("expected model t-bone, got %s", embedResp.Model)
+				}
+
+				if embedResp.Embeddings == nil {
+					t.Fatalf("expected embeddings to not be nil, got %v", embedResp.Embeddings)
+				}
+
+				if len(embedResp.Embeddings) != 0 {
+					t.Fatalf("expected embeddings to be empty, got %v", embedResp.Embeddings)
+				}
+			},
+		},
+		{
+			Name:   "Embed Handler Invalid Input",
+			Method: http.MethodPost,
+			Path:   "/api/embed",
+			Setup: func(t *testing.T, req *http.Request) {
+				embedReq := api.EmbedRequest{
+					Model: "t-bone",
+					Input: 2,
+				}
+				jsonData, err := json.Marshal(embedReq)
+				require.NoError(t, err)
+				req.Body = io.NopCloser(bytes.NewReader(jsonData))
+			},
+			Expected: func(t *testing.T, resp *http.Response) {
+				contentType := resp.Header.Get("Content-Type")
+				if contentType != "application/json; charset=utf-8" {
+					t.Fatalf("expected content type application/json; charset=utf-8, got %s", contentType)
+				}
+				_, err := io.ReadAll(resp.Body)
+
+				if err != nil {
+					t.Fatal(err)
+				}
+
+				if resp.StatusCode != http.StatusBadRequest {
+					t.Fatalf("expected status code 400, got %d", resp.StatusCode)
+				}
+			},
+		},
 	}
 
 	t.Setenv("OLLAMA_MODELS", t.TempDir())
@@ -420,3 +492,38 @@ func TestShow(t *testing.T) {
 		t.Fatal("Expected projector architecture to be 'clip', but got", resp.ProjectorInfo["general.architecture"])
 	}
 }
+
+func TestNormalize(t *testing.T) {
+	type testCase struct {
+		input []float32
+	}
+
+	testCases := []testCase{
+		{input: []float32{1}},
+		{input: []float32{0, 1, 2, 3}},
+		{input: []float32{0.1, 0.2, 0.3}},
+		{input: []float32{-0.1, 0.2, 0.3, -0.4}},
+		{input: []float32{0, 0, 0}},
+	}
+
+	isNormalized := func(vec []float32) (res bool) {
+		sum := 0.0
+		for _, v := range vec {
+			sum += float64(v * v)
+		}
+		if math.Abs(sum-1) > 1e-6 {
+			return sum == 0
+		} else {
+			return true
+		}
+	}
+
+	for _, tc := range testCases {
+		t.Run("", func(t *testing.T) {
+			normalized := normalize(tc.input)
+			if !isNormalized(normalized) {
+				t.Errorf("Vector %v is not normalized", tc.input)
+			}
+		})
+	}
+}
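
TestNormalize above only checks the invariant (unit length, or all zeros for a zero-length vector); the normalize function itself lives in server/routes.go and is not shown in this diff. A minimal sketch consistent with the test's isNormalized check, not necessarily the actual implementation:

```go
// Sketch of an L2 normalization that satisfies TestNormalize: non-zero vectors
// are scaled to unit length, and an all-zero vector is returned unchanged so
// its squared sum stays 0.
package server

import "math"

func normalizeSketch(vec []float32) []float32 {
	var sum float64
	for _, v := range vec {
		sum += float64(v) * float64(v)
	}

	var scale float32
	if sum > 0 {
		scale = float32(1.0 / math.Sqrt(sum))
	}

	out := make([]float32, len(vec))
	for i, v := range vec {
		out[i] = v * scale
	}
	return out
}
```
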
diff --git a/server/sched.go b/server/sched.go
index 2daed3ab..92b8d508 100644
--- a/server/sched.go
+++ b/server/sched.go
@@ -212,9 +212,12 @@ func (s *Scheduler) processPending(ctx context.Context) {
 					} else if loadedCount == 0 {
 						// No models loaded. Load the model but prefer the best fit.
 						slog.Debug("loading first model", "model", pending.model.ModelPath)
-						g := pickBestFitGPUs(pending, ggml, gpus, &numParallel)
+						g := pickBestFullFitByLibrary(pending, ggml, gpus, &numParallel)
 						if g != nil {
 							gpus = g
+						} else {
+							// Only allow partial loads when this is the first model
+							gpus = pickBestPartialFitByLibrary(pending, ggml, gpus, &numParallel)
 						}
 						s.loadFn(pending, ggml, gpus, numParallel)
 						break
@@ -231,7 +234,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
 
 						// Update free memory from currently loaded models
 						s.updateFreeSpace(availGpus)
-						fitGpus := pickBestFitGPUs(pending, ggml, availGpus, &numParallel)
+						fitGpus := pickBestFullFitByLibrary(pending, ggml, availGpus, &numParallel)
 						if fitGpus != nil {
 							slog.Debug("new model fits with existing models, loading")
 							s.loadFn(pending, ggml, fitGpus, numParallel)
@@ -668,11 +671,12 @@ func (a ByDuration) Less(i, j int) bool {
 // func (a BySize) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
 // func (a BySize) Less(i, j int) bool { return a[i].estimatedVRAM < a[j].estimatedVRAM }
 
-// pickBestFitGPUs will try to find the optimal placement of the model in the available GPUs where the model fully fits
+// pickBestFullFitByLibrary will try to find the optimal placement of the model in the available GPUs where the model fully fits
+// The list of GPUs returned will always be the same brand (library)
 // If the model cannot fit fully within the available GPU(s), nil is returned
 // If numParallel is <= 0, this will attempt to optimize parallelism based on available VRAM, and adjust
 // opts.NumCtx accordingly
-func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
+func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
 	var estimatedVRAM uint64
 
 	var numParallelToTry []int
@@ -723,6 +727,25 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numP
 	return nil
 }
 
+// If multiple Libraries are detected, pick the Library which loads the most layers for the model
+func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
+	*numParallel = 1
+	byLibrary := gpus.ByLibrary()
+	if len(byLibrary) <= 1 {
+		return gpus
+	}
+	var bestEstimate uint64
+	var bestFit int
+	for i, gl := range byLibrary {
+		_, estimatedVRAM := llm.PredictServerFit(gl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts)
+		if estimatedVRAM > bestEstimate {
+			bestEstimate = estimatedVRAM
+			bestFit = i
+		}
+	}
+	return byLibrary[bestFit]
+}
+
 // findRunnerToUnload finds a runner to unload to make room for a new model
 func (s *Scheduler) findRunnerToUnload() *runnerRef {
 	s.loadedMu.Lock()
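
For readers skimming the hunk above: the first loaded model is now placed in two stages, a full fit by library first, with a partial fit as the fallback. A self-contained toy version of that decision follows; the types and values are illustrative stand-ins for the real PredictServerFit estimates, only the control flow mirrors the change:

```go
// Toy illustration of the scheduler's new placement preference: take any
// library that fits the whole model, otherwise (first model only) take the
// library with the largest partial placement by estimated VRAM.
package main

import "fmt"

// placementEstimate stands in for a per-library fit estimate (hypothetical).
type placementEstimate struct {
	library       string
	estimatedVRAM uint64
	fullFit       bool
}

func pickPlacement(estimates []placementEstimate, firstModel bool) (string, bool) {
	best := -1
	for i, e := range estimates {
		if e.fullFit {
			// A full fit always wins.
			return e.library, true
		}
		if best == -1 || e.estimatedVRAM > estimates[best].estimatedVRAM {
			best = i
		}
	}
	// Partial loads are only allowed for the first model.
	if !firstModel || best == -1 {
		return "", false
	}
	return estimates[best].library, true
}

func main() {
	estimates := []placementEstimate{
		{library: "cuda", estimatedVRAM: 6 << 30},
		{library: "rocm", estimatedVRAM: 8 << 30},
	}
	lib, ok := pickPlacement(estimates, true)
	fmt.Println(lib, ok) // rocm true: the library that can hold the most of the model
}
```
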
diff --git a/server/sched_test.go b/server/sched_test.go
index 3fbd188a..4f8789fa 100644
--- a/server/sched_test.go
+++ b/server/sched_test.go
@@ -94,7 +94,7 @@ func TestLoad(t *testing.T) {
 	require.Len(t, s.expiredCh, 1)
 }
 
-type bundle struct {
+type reqBundle struct {
 	ctx     context.Context //nolint:containedctx
 	ctxDone func()
 	srv     *mockLlm
@@ -102,13 +102,13 @@ type bundle struct {
 	ggml    *llm.GGML
 }
 
-func (scenario *bundle) newServer(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+func (scenario *reqBundle) newServer(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
 	return scenario.srv, nil
 }
 
-func newScenario(t *testing.T, ctx context.Context, modelName string, estimatedVRAM uint64) *bundle {
-	scenario := &bundle{}
-	scenario.ctx, scenario.ctxDone = context.WithCancel(ctx)
+func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, estimatedVRAM uint64, duration *api.Duration) *reqBundle {
+	b := &reqBundle{}
+	b.ctx, b.ctxDone = context.WithCancel(ctx)
 	t.Helper()
 
 	f, err := os.CreateTemp(t.TempDir(), modelName)
@@ -135,124 +135,154 @@ func newScenario(t *testing.T, ctx context.Context, modelName string, estimatedV
 
 	fname := f.Name()
 	model := &Model{Name: modelName, ModelPath: fname}
-	scenario.ggml, err = llm.LoadModel(model.ModelPath, 0)
+	b.ggml, err = llm.LoadModel(model.ModelPath, 0)
 	require.NoError(t, err)
 
-	scenario.req = &LlmRequest{
-		ctx:             scenario.ctx,
+	if duration == nil {
+		duration = &api.Duration{Duration: 5 * time.Millisecond}
+	}
+	b.req = &LlmRequest{
+		ctx:             b.ctx,
 		model:           model,
 		opts:            api.DefaultOptions(),
-		sessionDuration: &api.Duration{Duration: 5 * time.Millisecond},
+		sessionDuration: duration,
 		successCh:       make(chan *runnerRef, 1),
 		errCh:           make(chan error, 1),
 	}
-	scenario.srv = &mockLlm{estimatedVRAM: estimatedVRAM, estimatedVRAMByGPU: map[string]uint64{"": estimatedVRAM}}
-	return scenario
+	b.srv = &mockLlm{estimatedVRAM: estimatedVRAM, estimatedVRAMByGPU: map[string]uint64{"": estimatedVRAM}}
+	return b
 }
 
-func TestRequests(t *testing.T) {
-	ctx, done := context.WithTimeout(context.Background(), 10*time.Second)
+func getGpuFn() gpu.GpuInfoList {
+	g := gpu.GpuInfo{Library: "metal"}
+	g.TotalMemory = 24 * format.GigaByte
+	g.FreeMemory = 12 * format.GigaByte
+	return []gpu.GpuInfo{g}
+}
+
+func getCpuFn() gpu.GpuInfoList {
+	g := gpu.GpuInfo{Library: "cpu"}
+	g.TotalMemory = 32 * format.GigaByte
+	g.FreeMemory = 26 * format.GigaByte
+	return []gpu.GpuInfo{g}
+}
+
+func TestRequestsSameModelSameRequest(t *testing.T) {
+	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
 	defer done()
-
-	// Same model, same request
-	scenario1a := newScenario(t, ctx, "ollama-model-1", 10)
-	scenario1a.req.sessionDuration = &api.Duration{Duration: 5 * time.Millisecond}
-	scenario1b := newScenario(t, ctx, "ollama-model-1", 11)
-	scenario1b.req.model = scenario1a.req.model
-	scenario1b.ggml = scenario1a.ggml
-	scenario1b.req.sessionDuration = &api.Duration{Duration: 0}
-
-	// simple reload of same model
-	scenario2a := newScenario(t, ctx, "ollama-model-1", 20)
-	tmpModel := *scenario1a.req.model
-	scenario2a.req.model = &tmpModel
-	scenario2a.ggml = scenario1a.ggml
-	scenario2a.req.sessionDuration = &api.Duration{Duration: 5 * time.Millisecond}
-
-	// Multiple loaded models
-	scenario3a := newScenario(t, ctx, "ollama-model-3a", 1*format.GigaByte)
-	scenario3b := newScenario(t, ctx, "ollama-model-3b", 24*format.GigaByte)
-	scenario3c := newScenario(t, ctx, "ollama-model-4a", 30)
-	scenario3c.req.opts.NumGPU = 0                           // CPU load, will be allowed
-	scenario3d := newScenario(t, ctx, "ollama-model-3c", 30) // Needs prior unloaded
-
 	s := InitScheduler(ctx)
-	s.getGpuFn = func() gpu.GpuInfoList {
-		g := gpu.GpuInfo{Library: "metal"}
-		g.TotalMemory = 24 * format.GigaByte
-		g.FreeMemory = 12 * format.GigaByte
-		return []gpu.GpuInfo{g}
-	}
-	s.getCpuFn = func() gpu.GpuInfoList {
-		g := gpu.GpuInfo{Library: "cpu"}
-		g.TotalMemory = 32 * format.GigaByte
-		g.FreeMemory = 26 * format.GigaByte
-		return []gpu.GpuInfo{g}
-	}
-	s.newServerFn = scenario1a.newServer
-	slog.Info("scenario1a")
-	s.pendingReqCh <- scenario1a.req
+	s.getGpuFn = getGpuFn
+	s.getCpuFn = getCpuFn
+	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
+	b := newScenarioRequest(t, ctx, "ollama-model-1", 11, &api.Duration{Duration: 0})
+	b.req.model = a.req.model
+	b.ggml = a.ggml
+
+	s.newServerFn = a.newServer
+	slog.Info("a")
+	s.pendingReqCh <- a.req
 	require.Len(t, s.pendingReqCh, 1)
 	s.Run(ctx)
 	select {
-	case resp := <-scenario1a.req.successCh:
-		require.Equal(t, resp.llama, scenario1a.srv)
+	case resp := <-a.req.successCh:
+		require.Equal(t, resp.llama, a.srv)
 		require.Empty(t, s.pendingReqCh)
-		require.Empty(t, scenario1a.req.errCh)
-	case err := <-scenario1a.req.errCh:
+		require.Empty(t, a.req.errCh)
+	case err := <-a.req.errCh:
 		t.Fatal(err.Error())
 	case <-ctx.Done():
 		t.Fatal("timeout")
 	}
 
 	// Same runner as first request due to not needing a reload
-	s.newServerFn = scenario1b.newServer
-	slog.Info("scenario1b")
-	s.pendingReqCh <- scenario1b.req
+	s.newServerFn = b.newServer
+	slog.Info("b")
+	s.pendingReqCh <- b.req
 	select {
-	case resp := <-scenario1b.req.successCh:
-		require.Equal(t, resp.llama, scenario1a.srv)
+	case resp := <-b.req.successCh:
+		require.Equal(t, resp.llama, a.srv)
 		require.Empty(t, s.pendingReqCh)
-		require.Empty(t, scenario1b.req.errCh)
-	case err := <-scenario1b.req.errCh:
+		require.Empty(t, b.req.errCh)
+	case err := <-b.req.errCh:
+		t.Fatal(err.Error())
+	case <-ctx.Done():
+		t.Fatal("timeout")
+	}
+}
+
+func TestRequestsSimpleReloadSameModel(t *testing.T) {
+	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
+	defer done()
+	s := InitScheduler(ctx)
+	s.getGpuFn = getGpuFn
+	s.getCpuFn = getCpuFn
+	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
+	b := newScenarioRequest(t, ctx, "ollama-model-1", 20, &api.Duration{Duration: 5 * time.Millisecond})
+	tmpModel := *a.req.model
+	b.req.model = &tmpModel
+	b.ggml = a.ggml
+
+	s.newServerFn = a.newServer
+	slog.Info("a")
+	s.pendingReqCh <- a.req
+	require.Len(t, s.pendingReqCh, 1)
+	s.Run(ctx)
+	select {
+	case resp := <-a.req.successCh:
+		require.Equal(t, resp.llama, a.srv)
+		require.Empty(t, s.pendingReqCh)
+		require.Empty(t, a.req.errCh)
+	case err := <-a.req.errCh:
 		t.Fatal(err.Error())
 	case <-ctx.Done():
 		t.Fatal("timeout")
 	}
 
 	// Trigger a reload
-	s.newServerFn = scenario2a.newServer
-	scenario2a.req.model.AdapterPaths = []string{"new"}
-	slog.Info("scenario2a")
-	s.pendingReqCh <- scenario2a.req
+	s.newServerFn = b.newServer
+	b.req.model.AdapterPaths = []string{"new"}
+	slog.Info("b")
+	s.pendingReqCh <- b.req
 	// finish first two requests, so model can reload
 	time.Sleep(1 * time.Millisecond)
-	scenario1a.ctxDone()
-	scenario1b.ctxDone()
+	a.ctxDone()
 	select {
-	case resp := <-scenario2a.req.successCh:
-		require.Equal(t, resp.llama, scenario2a.srv)
+	case resp := <-b.req.successCh:
+		require.Equal(t, resp.llama, b.srv)
 		require.Empty(t, s.pendingReqCh)
-		require.Empty(t, scenario2a.req.errCh)
-	case err := <-scenario2a.req.errCh:
+		require.Empty(t, b.req.errCh)
+	case err := <-b.req.errCh:
 		t.Fatal(err.Error())
 	case <-ctx.Done():
 		t.Fatal("timeout")
 	}
+}
+
+func TestRequestsMultipleLoadedModels(t *testing.T) {
+	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
+	defer done()
+	s := InitScheduler(ctx)
+	s.getGpuFn = getGpuFn
+	s.getCpuFn = getCpuFn
+
+	// Multiple loaded models
+	a := newScenarioRequest(t, ctx, "ollama-model-3a", 1*format.GigaByte, nil)
+	b := newScenarioRequest(t, ctx, "ollama-model-3b", 24*format.GigaByte, nil)
+	c := newScenarioRequest(t, ctx, "ollama-model-4a", 30, nil)
+	c.req.opts.NumGPU = 0                                       // CPU load, will be allowed
+	d := newScenarioRequest(t, ctx, "ollama-model-3c", 30, nil) // Needs prior unloaded
 
 	envconfig.MaxRunners = 1
-	s.newServerFn = scenario3a.newServer
-	slog.Info("scenario3a")
-	s.pendingReqCh <- scenario3a.req
-	// finish prior request, so new model can load
-	time.Sleep(1 * time.Millisecond)
-	scenario2a.ctxDone()
+	s.newServerFn = a.newServer
+	slog.Info("a")
+	s.pendingReqCh <- a.req
+	s.Run(ctx)
 	select {
-	case resp := <-scenario3a.req.successCh:
-		require.Equal(t, resp.llama, scenario3a.srv)
+	case resp := <-a.req.successCh:
+		require.Equal(t, resp.llama, a.srv)
 		require.Empty(t, s.pendingReqCh)
-		require.Empty(t, scenario3a.req.errCh)
-	case err := <-scenario3a.req.errCh:
+		require.Empty(t, a.req.errCh)
+	case err := <-a.req.errCh:
 		t.Fatal(err.Error())
 	case <-ctx.Done():
 		t.Fatal("timeout")
@@ -262,15 +292,15 @@ func TestRequests(t *testing.T) {
 	s.loadedMu.Unlock()
 
 	envconfig.MaxRunners = 0
-	s.newServerFn = scenario3b.newServer
-	slog.Info("scenario3b")
-	s.pendingReqCh <- scenario3b.req
+	s.newServerFn = b.newServer
+	slog.Info("b")
+	s.pendingReqCh <- b.req
 	select {
-	case resp := <-scenario3b.req.successCh:
-		require.Equal(t, resp.llama, scenario3b.srv)
+	case resp := <-b.req.successCh:
+		require.Equal(t, resp.llama, b.srv)
 		require.Empty(t, s.pendingReqCh)
-		require.Empty(t, scenario3b.req.errCh)
-	case err := <-scenario3b.req.errCh:
+		require.Empty(t, b.req.errCh)
+	case err := <-b.req.errCh:
 		t.Fatal(err.Error())
 	case <-ctx.Done():
 		t.Fatal("timeout")
@@ -280,15 +310,15 @@ func TestRequests(t *testing.T) {
 	s.loadedMu.Unlock()
 
 	// This is a CPU load with NumGPU = 0 so it should load
-	s.newServerFn = scenario3c.newServer
-	slog.Info("scenario3c")
-	s.pendingReqCh <- scenario3c.req
+	s.newServerFn = c.newServer
+	slog.Info("c")
+	s.pendingReqCh <- c.req
 	select {
-	case resp := <-scenario3c.req.successCh:
-		require.Equal(t, resp.llama, scenario3c.srv)
+	case resp := <-c.req.successCh:
+		require.Equal(t, resp.llama, c.srv)
 		require.Empty(t, s.pendingReqCh)
-		require.Empty(t, scenario3c.req.errCh)
-	case err := <-scenario3c.req.errCh:
+		require.Empty(t, c.req.errCh)
+	case err := <-c.req.errCh:
 		t.Fatal(err.Error())
 	case <-ctx.Done():
 		t.Fatal("timeout")
@@ -298,25 +328,25 @@ func TestRequests(t *testing.T) {
 	s.loadedMu.Unlock()
 
 	// Try to load a model that won't fit
-	s.newServerFn = scenario3d.newServer
-	slog.Info("scenario3d")
+	s.newServerFn = d.newServer
+	slog.Info("d")
 	s.loadedMu.Lock()
 	require.Len(t, s.loaded, 3)
 	s.loadedMu.Unlock()
-	scenario3a.ctxDone() // Won't help since this one isn't big enough to make room
+	a.ctxDone() // Won't help since this one isn't big enough to make room
 	time.Sleep(2 * time.Millisecond)
-	s.pendingReqCh <- scenario3d.req
+	s.pendingReqCh <- d.req
 	// finish prior request, so new model can load
 	time.Sleep(6 * time.Millisecond)
 	s.loadedMu.Lock()
 	require.Len(t, s.loaded, 2)
 	s.loadedMu.Unlock()
-	scenario3b.ctxDone()
+	b.ctxDone()
 	select {
-	case resp := <-scenario3d.req.successCh:
-		require.Equal(t, resp.llama, scenario3d.srv)
+	case resp := <-d.req.successCh:
+		require.Equal(t, resp.llama, d.srv)
 		require.Empty(t, s.pendingReqCh)
-		require.Empty(t, scenario3d.req.errCh)
+		require.Empty(t, d.req.errCh)
 	case <-ctx.Done():
 		t.Fatal("timeout")
 	}
@@ -329,26 +359,19 @@ func TestGetRunner(t *testing.T) {
 	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
 	defer done()
 
-	scenario1a := newScenario(t, ctx, "ollama-model-1a", 10)
-	scenario1a.req.sessionDuration = &api.Duration{Duration: 0}
-	scenario1b := newScenario(t, ctx, "ollama-model-1b", 10)
-	scenario1b.req.sessionDuration = &api.Duration{Duration: 0}
-	scenario1c := newScenario(t, ctx, "ollama-model-1c", 10)
-	scenario1c.req.sessionDuration = &api.Duration{Duration: 0}
+	a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, &api.Duration{Duration: 2 * time.Millisecond})
+	b := newScenarioRequest(t, ctx, "ollama-model-1b", 10, &api.Duration{Duration: 2 * time.Millisecond})
+	c := newScenarioRequest(t, ctx, "ollama-model-1c", 10, &api.Duration{Duration: 2 * time.Millisecond})
 	envconfig.MaxQueuedRequests = 1
 	s := InitScheduler(ctx)
-	s.getGpuFn = func() gpu.GpuInfoList {
-		g := gpu.GpuInfo{Library: "metal"}
-		g.TotalMemory = 24 * format.GigaByte
-		g.FreeMemory = 12 * format.GigaByte
-		return []gpu.GpuInfo{g}
-	}
-	s.newServerFn = scenario1a.newServer
-	slog.Info("scenario1a")
-	successCh1a, errCh1a := s.GetRunner(scenario1a.ctx, scenario1a.req.model, scenario1a.req.opts, scenario1a.req.sessionDuration)
+	s.getGpuFn = getGpuFn
+	s.getCpuFn = getCpuFn
+	s.newServerFn = a.newServer
+	slog.Info("a")
+	successCh1a, errCh1a := s.GetRunner(a.ctx, a.req.model, a.req.opts, a.req.sessionDuration)
 	require.Len(t, s.pendingReqCh, 1)
-	slog.Info("scenario1b")
-	successCh1b, errCh1b := s.GetRunner(scenario1b.ctx, scenario1b.req.model, scenario1b.req.opts, scenario1b.req.sessionDuration)
+	slog.Info("b")
+	successCh1b, errCh1b := s.GetRunner(b.ctx, b.req.model, b.req.opts, b.req.sessionDuration)
 	require.Len(t, s.pendingReqCh, 1)
 	require.Empty(t, successCh1b)
 	require.Len(t, errCh1b, 1)
@@ -357,22 +380,24 @@ func TestGetRunner(t *testing.T) {
 	s.Run(ctx)
 	select {
 	case resp := <-successCh1a:
-		require.Equal(t, resp.llama, scenario1a.srv)
+		require.Equal(t, resp.llama, a.srv)
 		require.Empty(t, s.pendingReqCh)
 		require.Empty(t, errCh1a)
+	case err := <-errCh1a:
+		t.Fatal(err.Error())
 	case <-ctx.Done():
 		t.Fatal("timeout")
 	}
-	scenario1a.ctxDone()
+	a.ctxDone() // Set "a" model to idle so it can unload
 	s.loadedMu.Lock()
 	require.Len(t, s.loaded, 1)
 	s.loadedMu.Unlock()
 
-	scenario1c.req.model.ModelPath = "bad path"
-	slog.Info("scenario1c")
-	successCh1c, errCh1c := s.GetRunner(scenario1c.ctx, scenario1c.req.model, scenario1c.req.opts, scenario1c.req.sessionDuration)
+	c.req.model.ModelPath = "bad path"
+	slog.Info("c")
+	successCh1c, errCh1c := s.GetRunner(c.ctx, c.req.model, c.req.opts, c.req.sessionDuration)
 	// Starts in the pending channel, then should be quickly processed to return an error
-	time.Sleep(5 * time.Millisecond)
+	time.Sleep(20 * time.Millisecond) // Long enough for the "a" model to expire and unload
 	require.Empty(t, successCh1c)
 	s.loadedMu.Lock()
 	require.Empty(t, s.loaded)
@@ -380,7 +405,7 @@ func TestGetRunner(t *testing.T) {
 	require.Len(t, errCh1c, 1)
 	err = <-errCh1c
 	require.Contains(t, err.Error(), "bad path")
-	scenario1b.ctxDone()
+	b.ctxDone()
 }
 
 // TODO - add one scenario that triggers the bogus finished event with positive ref count
@@ -389,7 +414,7 @@ func TestPrematureExpired(t *testing.T) {
 	defer done()
 
 	// Same model, same request
-	scenario1a := newScenario(t, ctx, "ollama-model-1a", 10)
+	scenario1a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, nil)
 	s := InitScheduler(ctx)
 	s.getGpuFn = func() gpu.GpuInfoList {
 		g := gpu.GpuInfo{Library: "metal"}
@@ -411,6 +436,8 @@ func TestPrematureExpired(t *testing.T) {
 		s.loadedMu.Unlock()
 		slog.Info("sending premature expired event now")
 		s.expiredCh <- resp // Shouldn't happen in real life, but make sure it's safe
+	case err := <-errCh1a:
+		t.Fatal(err.Error())
 	case <-ctx.Done():
 		t.Fatal("timeout")
 	}
@@ -446,6 +473,8 @@ func TestUseLoadedRunner(t *testing.T) {
 	select {
 	case success := <-req.successCh:
 		require.Equal(t, r1, success)
+	case err := <-req.errCh:
+		t.Fatal(err.Error())
 	case <-ctx.Done():
 		t.Fatal("timeout")
 	}
@@ -625,8 +654,7 @@ func TestAlreadyCanceled(t *testing.T) {
 	defer done()
 	dctx, done2 := context.WithCancel(ctx)
 	done2()
-	scenario1a := newScenario(t, dctx, "ollama-model-1", 10)
-	scenario1a.req.sessionDuration = &api.Duration{Duration: 0}
+	scenario1a := newScenarioRequest(t, dctx, "ollama-model-1", 10, &api.Duration{Duration: 0})
 	s := InitScheduler(ctx)
 	slog.Info("scenario1a")
 	s.pendingReqCh <- scenario1a.req
@@ -638,12 +666,51 @@ func TestAlreadyCanceled(t *testing.T) {
 	require.Empty(t, scenario1a.req.successCh)
 }
 
+func TestHomogeneousGPUs(t *testing.T) {
+	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
+	defer done()
+	s := InitScheduler(ctx)
+
+	s.getGpuFn = func() gpu.GpuInfoList {
+		// Set memory values to require the model to be spread
+		gpus := []gpu.GpuInfo{
+			{Library: "cuda"},
+			{Library: "rocm"},
+		}
+		gpus[0].TotalMemory = 1 * format.GibiByte
+		gpus[0].FreeMemory = 256 * format.MebiByte
+		gpus[1].TotalMemory = 1 * format.GibiByte
+		gpus[1].FreeMemory = 256 * format.MebiByte
+		return gpus
+	}
+	s.getCpuFn = getCpuFn
+	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
+	s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+		require.Len(t, gpus, 1)
+		return a.newServer(gpus, model, ggml, adapters, projectors, opts, numParallel)
+	}
+	slog.Info("a")
+	s.pendingReqCh <- a.req
+	require.Len(t, s.pendingReqCh, 1)
+	s.Run(ctx)
+	select {
+	case resp := <-a.req.successCh:
+		require.Equal(t, resp.llama, a.srv)
+		require.Empty(t, s.pendingReqCh)
+		require.Empty(t, a.req.errCh)
+	case err := <-a.req.errCh:
+		t.Fatal(err.Error())
+	case <-ctx.Done():
+		t.Fatal("timeout")
+	}
+}
+
 type mockLlm struct {
 	pingResp           error
 	waitResp           error
 	completionResp     error
-	embeddingResp      []float64
-	embeddingRespErr   error
+	embedResp          *llm.EmbedResponse
+	embedRespErr       error
 	tokenizeResp       []int
 	tokenizeRespErr    error
 	detokenizeResp     string
@@ -660,8 +727,8 @@ func (s *mockLlm) WaitUntilRunning(ctx context.Context) error { return s.waitRes
 func (s *mockLlm) Completion(ctx context.Context, req llm.CompletionRequest, fn func(llm.CompletionResponse)) error {
 	return s.completionResp
 }
-func (s *mockLlm) Embedding(ctx context.Context, prompt string) ([]float64, error) {
-	return s.embeddingResp, s.embeddingRespErr
+func (s *mockLlm) Embed(ctx context.Context, input []string) (*llm.EmbedResponse, error) {
+	return s.embedResp, s.embedRespErr
 }
 func (s *mockLlm) Tokenize(ctx context.Context, content string) ([]int, error) {
 	return s.tokenizeResp, s.tokenizeRespErr
diff --git a/server/testdata/tools/command-r-plus.gotmpl b/server/testdata/tools/command-r-plus.gotmpl
new file mode 100644
index 00000000..f30124e3
--- /dev/null
+++ b/server/testdata/tools/command-r-plus.gotmpl
@@ -0,0 +1,67 @@
+{{- if or .Tools .System }}<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>
+{{- if .Tools }}# Safety Preamble
+The instructions in this section override those in the task description and style guide sections. Don't answer questions that are harmful or immoral.
+
+# System Preamble
+## Basic Rules
+You are a powerful conversational AI trained by Cohere to help people. You are augmented by a number of tools, and your job is to use and consume the output of these tools to best help the user. You will see a conversation history between yourself and a user, ending with an utterance from the user. You will then see a specific instruction instructing you what kind of response to generate. When you answer the user's requests, you cite your sources in your answers, according to those instructions.
+
+{{ if .System }}# User Preamble
+{{ .System }}
+{{- end }}
+
+## Available Tools
+Here is a list of tools that you have available to you:
+{{- range .Tools }}
+
+```python
+def {{ .Function.Name }}(
+{{- range $name, $property := .Function.Parameters.Properties }}{{ $name }}: {{ $property.Type }}, {{ end }}) -> List[Dict]:
+    """{{ .Function.Description }}
+
+{{- if .Function.Parameters.Properties }}
+
+    Args:
+{{- range $name, $property := .Function.Parameters.Properties }}
+        {{ $name }} ({{ $property.Type }}): {{ $property.Description }}
+{{- end }}
+{{- end }}
+    """
+    pass
+```
+{{- end }}
+{{- else if .System }}{{ .System }}
+{{- end }}<|END_OF_TURN_TOKEN|>
+{{- end }}
+{{- range .Messages }}
+{{- if eq .Role "system" }}
+{{- continue }}
+{{- end }}<|START_OF_TURN_TOKEN|>
+{{- if eq .Role "user" }}<|USER_TOKEN|>{{ .Content }}
+{{- else if eq .Role "assistant" }}<|CHATBOT_TOKEN|>
+{{- if .Content }}{{ .Content }}
+{{- else if .ToolCalls }}
+Action: ```json
+[
+{{- range .ToolCalls }}
+    {
+        "tool_name": "{{ .Function.Name }}",
+        "parameters": {{ .Function.Arguments }}
+    }
+{{- end }}
+]```
+{{ continue }}
+{{ end }}
+{{- else if eq .Role "tool" }}<|SYSTEM_TOKEN|>
+{{ .Content }}
+{{- end }}<|END_OF_TURN_TOKEN|>
+{{- end }}
+{{- if .Tools }}<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>Write 'Action:' followed by a json-formatted list of actions that you want to perform in order to produce a good response to the user's last input. You can use any of the supplied tools any number of times, but you should aim to execute the minimum number of necessary actions for the input. You should use the `directly-answer` tool if calling the other tools is unnecessary. The list of actions you want to call should be formatted as a list of json objects, for example:
+```json
+[
+    {
+        "tool_name": title of the tool in the specification,
+        "parameters": a dict of parameters to input into the tool as they are defined in the specs, or {} if it takes no parameters
+    }
+]```
+{{- end }}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>
\ No newline at end of file
diff --git a/server/testdata/tools/command-r-plus.out b/server/testdata/tools/command-r-plus.out
new file mode 100644
index 00000000..425af75a
--- /dev/null
+++ b/server/testdata/tools/command-r-plus.out
@@ -0,0 +1,39 @@
+<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|># Safety Preamble
+The instructions in this section override those in the task description and style guide sections. Don't answer questions that are harmful or immoral.
+
+# System Preamble
+## Basic Rules
+You are a powerful conversational AI trained by Cohere to help people. You are augmented by a number of tools, and your job is to use and consume the output of these tools to best help the user. You will see a conversation history between yourself and a user, ending with an utterance from the user. You will then see a specific instruction instructing you what kind of response to generate. When you answer the user's requests, you cite your sources in your answers, according to those instructions.
+
+# User Preamble
+You are a knowledgable assistant. You can answer questions and perform tasks.
+
+## Available Tools
+Here is a list of tools that you have available to you:
+
+```python
+def get_current_weather(format: string, location: string, ) -> List[Dict]:
+    """Get the current weather
+
+    Args:
+        format (string): The temperature unit to use. Infer this from the users location.
+        location (string): The city and state, e.g. San Francisco, CA
+    """
+    pass
+```<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>What's the weather like today in Paris?<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>
+Action: ```json
+[
+    {
+        "tool_name": "get_current_weather",
+        "parameters": {"format":"celsius","location":"Paris, France"}
+    }
+]```
+<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>
+22<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>The current temperature in Paris, France is 22 degrees Celsius.<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>What's the weather like today in San Francisco and Toronto?<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>Write 'Action:' followed by a json-formatted list of actions that you want to perform in order to produce a good response to the user's last input. You can use any of the supplied tools any number of times, but you should aim to execute the minimum number of necessary actions for the input. You should use the `directly-answer` tool if calling the other tools is unnecessary. The list of actions you want to call should be formatted as a list of json objects, for example:
+```json
+[
+    {
+        "tool_name": title of the tool in the specification,
+        "parameters": a dict of parameters to input into the tool as they are defined in the specs, or {} if it takes no parameters
+    }
+]```<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>
\ No newline at end of file
diff --git a/server/testdata/tools/firefunction.gotmpl b/server/testdata/tools/firefunction.gotmpl
new file mode 100644
index 00000000..312be205
--- /dev/null
+++ b/server/testdata/tools/firefunction.gotmpl
@@ -0,0 +1,31 @@
+{{- if or .System .Tools }}<|start_header_id|>system<|end_header_id|>
+{{- if .System }}
+{{ .System }}
+{{- end }}
+In addition to plain text responses, you can chose to call one or more of the provided functions.
+
+Use the following rule to decide when to call a function:
+  * if the response can be generated from your internal knowledge (e.g., as in the case of queries like "What is the capital of Poland?"), do so
+  * if you need external information that can be obtained by calling one or more of the provided functions, generate a function calls
+
+If you decide to call functions:
+  * prefix function calls with functools marker (no closing marker required)
+  * all function calls should be generated in a single JSON list formatted as functools[{"name": [function name], "arguments": [function arguments as JSON]}, ...]
+  * follow the provided JSON schema. Do not hallucinate arguments or values. Do to blindly copy values from the provided samples
+  * respect the argument type formatting. E.g., if the type if number and format is float, write value 7 as 7.0
+  * make sure you pick the right functions that match the user intent
+
+Available functions as JSON spec:
+{{- if .Tools }}
+{{ .Tools }}
+{{- end }}<|eot_id|>
+{{- end }}
+{{- range .Messages }}<|start_header_id|>
+{{- if or (eq .Role "user") (eq .Role "assistant") (eq .Role "tool") }}{{ .Role }}
+{{- end }}<|end_header_id|>
+{{- if .Content }}{{ .Content }}
+{{- else if .ToolCalls }} functools[
+{{- range .ToolCalls }}{{ "{" }}"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}{{ "}" }}
+{{- end }}]
+{{- end }}<|eot_id|>
+{{- end }}<|start_header_id|>assistant<|end_header_id|>
\ No newline at end of file
diff --git a/server/testdata/tools/firefunction.out b/server/testdata/tools/firefunction.out
new file mode 100644
index 00000000..be50175e
--- /dev/null
+++ b/server/testdata/tools/firefunction.out
@@ -0,0 +1,17 @@
+<|start_header_id|>system<|end_header_id|>
+You are a knowledgable assistant. You can answer questions and perform tasks.
+In addition to plain text responses, you can chose to call one or more of the provided functions.
+
+Use the following rule to decide when to call a function:
+  * if the response can be generated from your internal knowledge (e.g., as in the case of queries like "What is the capital of Poland?"), do so
+  * if you need external information that can be obtained by calling one or more of the provided functions, generate a function calls
+
+If you decide to call functions:
+  * prefix function calls with functools marker (no closing marker required)
+  * all function calls should be generated in a single JSON list formatted as functools[{"name": [function name], "arguments": [function arguments as JSON]}, ...]
+  * follow the provided JSON schema. Do not hallucinate arguments or values. Do to blindly copy values from the provided samples
+  * respect the argument type formatting. E.g., if the type if number and format is float, write value 7 as 7.0
+  * make sure you pick the right functions that match the user intent
+
+Available functions as JSON spec:
+[{"type":"function","function":{"name":"get_current_weather","description":"Get the current weather","parameters":{"type":"object","required":["location","format"],"properties":{"format":{"type":"string","description":"The temperature unit to use. Infer this from the users location.","enum":["celsius","fahrenheit"]},"location":{"type":"string","description":"The city and state, e.g. San Francisco, CA"}}}}}]<|eot_id|><|start_header_id|><|end_header_id|>You are a knowledgable assistant. You can answer questions and perform tasks.<|eot_id|><|start_header_id|>user<|end_header_id|>What's the weather like today in Paris?<|eot_id|><|start_header_id|>assistant<|end_header_id|> functools[{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Paris, France"}}]<|eot_id|><|start_header_id|>tool<|end_header_id|>22<|eot_id|><|start_header_id|>assistant<|end_header_id|>The current temperature in Paris, France is 22 degrees Celsius.<|eot_id|><|start_header_id|>user<|end_header_id|>What's the weather like today in San Francisco and Toronto?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
\ No newline at end of file
diff --git a/server/testdata/tools/llama3-groq-tool-use.gotmpl b/server/testdata/tools/llama3-groq-tool-use.gotmpl
new file mode 100644
index 00000000..45e9b462
--- /dev/null
+++ b/server/testdata/tools/llama3-groq-tool-use.gotmpl
@@ -0,0 +1,43 @@
+{{- if .Messages }}
+{{- if or .System .Tools }}<|start_header_id|>system<|end_header_id|>
+
+{{ .System }}
+{{- if .Tools }} You are provided with function signatures within  XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. For each function call return a json object with function name and arguments within  XML tags as follows:
+
+{"name": ,"arguments": }
+
+
+Here are the available tools:
+
+{{- range .Tools }} {{ .Function }}
+{{- end }} 
+{{- end }}
+{{- end }}<|eot_id|>
+{{- range .Messages }}
+{{- if ne .Role "system" }}<|start_header_id|>{{ .Role }}<|end_header_id|>
+
+{{ if eq .Role "user" }}{{ .Content }}
+{{- else if eq .Role "assistant" }}
+{{- if .Content }}{{ .Content }}
+{{- else if .ToolCalls }}
+{{ range .ToolCalls }}{"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}}
+{{- end }}
+
+{{- end }}
+{{- else if eq .Role "tool" }}
+{{ .Content }}
+
+{{- end }}<|eot_id|>
+{{- end }}
+{{- end }}<|start_header_id|>assistant<|end_header_id|>
+
+{{ else }}
+{{ if .System }}<|start_header_id|>system<|end_header_id|>
+
+{{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|>
+
+{{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|>
+
+{{ end }}{{ .Response }}
+{{- if .Response }}<|eot_id|>
+{{- end }}
\ No newline at end of file
diff --git a/server/testdata/tools/llama3-groq-tool-use.out b/server/testdata/tools/llama3-groq-tool-use.out
new file mode 100644
index 00000000..75a49558
--- /dev/null
+++ b/server/testdata/tools/llama3-groq-tool-use.out
@@ -0,0 +1,24 @@
+<|start_header_id|>system<|end_header_id|>
+
+You are a knowledgable assistant. You can answer questions and perform tasks. You are provided with function signatures within  XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. For each function call return a json object with function name and arguments within  XML tags as follows:
+
+{"name": ,"arguments": }
+
+
+Here are the available tools:
+ {"name":"get_current_weather","description":"Get the current weather","parameters":{"type":"object","required":["location","format"],"properties":{"format":{"type":"string","description":"The temperature unit to use. Infer this from the users location.","enum":["celsius","fahrenheit"]},"location":{"type":"string","description":"The city and state, e.g. San Francisco, CA"}}}} <|eot_id|><|start_header_id|>user<|end_header_id|>
+
+What's the weather like today in Paris?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
+
+
+{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Paris, France"}}
+<|eot_id|><|start_header_id|>tool<|end_header_id|>
+
+
+22
+<|eot_id|><|start_header_id|>assistant<|end_header_id|>
+
+The current temperature in Paris, France is 22 degrees Celsius.<|eot_id|><|start_header_id|>user<|end_header_id|>
+
+What's the weather like today in San Francisco and Toronto?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
+
diff --git a/server/testdata/tools/messages.json b/server/testdata/tools/messages.json
new file mode 100644
index 00000000..1a3d1f56
--- /dev/null
+++ b/server/testdata/tools/messages.json
@@ -0,0 +1,39 @@
+[
+  {
+    "role": "system",
+    "content": "You are a knowledgable assistant. You can answer questions and perform tasks."
+  },
+  {
+    "role": "user",
+    "content": "What's the weather like today in Paris?"
+  },
+  {
+    "role": "assistant",
+    "tool_calls": [
+      {
+        "id": "89a1e453-0bce-4de3-a456-c54bed09c520",
+        "type": "function",
+        "function": {
+          "name": "get_current_weather",
+          "arguments": {
+            "location": "Paris, France",
+            "format": "celsius"
+          }
+        }
+      }
+    ]
+  },
+  {
+    "role": "tool",
+    "tool_call_id": "89a1e453-0bce-4de3-a456-c54bed09c520",
+    "content": "22"
+  },
+  {
+    "role": "assistant",
+    "content": "The current temperature in Paris, France is 22 degrees Celsius."
+  },
+  {
+    "role": "user",
+    "content": "What's the weather like today in San Francisco and Toronto?"
+  }
+]
diff --git a/server/testdata/tools/mistral.gotmpl b/server/testdata/tools/mistral.gotmpl
new file mode 100644
index 00000000..b08d6c2c
--- /dev/null
+++ b/server/testdata/tools/mistral.gotmpl
@@ -0,0 +1,15 @@
+{{- range $index, $_ := .Messages }}
+{{- if eq .Role "user" }}
+{{- if and (eq (len (slice $.Messages $index)) 1) $.Tools }}[AVAILABLE_TOOLS] {{ $.Tools }}[/AVAILABLE_TOOLS]
+{{- end }}[INST] {{ if and (eq (len (slice $.Messages $index)) 1) $.System }}{{ $.System }}
+
+{{ end }}{{ .Content }}[/INST]
+{{- else if eq .Role "assistant" }}
+{{- if .Content }} {{ .Content }}
+{{- else if .ToolCalls }}[TOOL_CALLS] [
+{{- range .ToolCalls }}{"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}}
+{{- end }}]
+{{- end }}
+{{- else if eq .Role "tool" }}[TOOL_RESULTS] {"content": {{ .Content }}}[/TOOL_RESULTS]
+{{- end }}
+{{- end }}
\ No newline at end of file
diff --git a/server/testdata/tools/mistral.out b/server/testdata/tools/mistral.out
new file mode 100644
index 00000000..31d8cdd6
--- /dev/null
+++ b/server/testdata/tools/mistral.out
@@ -0,0 +1,3 @@
+[INST] What's the weather like today in Paris?[/INST][TOOL_CALLS] [{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Paris, France"}}][TOOL_RESULTS] {"content": 22}[/TOOL_RESULTS] The current temperature in Paris, France is 22 degrees Celsius.[AVAILABLE_TOOLS] [{"type":"function","function":{"name":"get_current_weather","description":"Get the current weather","parameters":{"type":"object","required":["location","format"],"properties":{"format":{"type":"string","description":"The temperature unit to use. Infer this from the users location.","enum":["celsius","fahrenheit"]},"location":{"type":"string","description":"The city and state, e.g. San Francisco, CA"}}}}}][/AVAILABLE_TOOLS][INST] You are a knowledgable assistant. You can answer questions and perform tasks.
+
+What's the weather like today in San Francisco and Toronto?[/INST]
\ No newline at end of file
diff --git a/server/testdata/tools/tools.json b/server/testdata/tools/tools.json
new file mode 100644
index 00000000..17260bf8
--- /dev/null
+++ b/server/testdata/tools/tools.json
@@ -0,0 +1,30 @@
+[
+  {
+    "type": "function",
+    "function": {
+      "name": "get_current_weather",
+      "description": "Get the current weather",
+      "parameters": {
+        "type": "object",
+        "properties": {
+          "location": {
+            "type": "string",
+            "description": "The city and state, e.g. San Francisco, CA"
+          },
+          "format": {
+            "type": "string",
+            "enum": [
+              "celsius",
+              "fahrenheit"
+            ],
+            "description": "The temperature unit to use. Infer this from the users location."
+          }
+        },
+        "required": [
+          "location",
+          "format"
+        ]
+      }
+    }
+  }
+]
diff --git a/server/testdata/tools/xlam.gotmpl b/server/testdata/tools/xlam.gotmpl
new file mode 100644
index 00000000..51513d69
--- /dev/null
+++ b/server/testdata/tools/xlam.gotmpl
@@ -0,0 +1,45 @@
+{{- if .System }}{{ .System }}
+{{ end }}
+{{- range $i, $_ := .Messages }}
+{{- if eq .Role "user" }}### Instruction:
+{{- if and $.Tools (le (len (slice $.Messages $i)) 2) }}
+[BEGIN OF TASK INSTRUCTION]
+You are an expert in composing functions. You are given a question and a set of possible functions. 
+Based on the question, you will need to make one or more function/tool calls to achieve the purpose. 
+If none of the functions can be used, point it out and refuse to answer. 
+If the given question lacks the parameters required by the function, also point it out.
+[END OF TASK INSTRUCTION]
+
+[BEGIN OF AVAILABLE TOOLS]
+{{ $.Tools }}
+[END OF AVAILABLE TOOLS]
+
+[BEGIN OF FORMAT INSTRUCTION]
+The output MUST strictly adhere to the following JSON format, and NO other text MUST be included.
+The example format is as follows. Please make sure the parameter type is correct. If no function call is needed, please make tool_calls an empty list '[]'.
+```
+{
+    "tool_calls": [
+    {"name": "func_name1", "arguments": {"argument1": "value1", "argument2": "value2"}},
+    ... (more tool calls as required)
+    ]
+}
+```
+[END OF FORMAT INSTRUCTION]
+
+[BEGIN OF QUERY]
+{{ .Content }}
+[END OF QUERY]
+
+
+{{ else }}
+{{ .Content }}
+{{ end }}
+{{- else if .ToolCalls }}### Response:
+{"tool_calls": [{{ range .ToolCalls }}{"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}}{{ end }}]}
+<|EOT|>
+{{ else if eq .Role "assistant" }}### Response:
+{{ .Content }}
+<|EOT|>
+{{ end }}
+{{- end }}### Response:
\ No newline at end of file
diff --git a/server/testdata/tools/xlam.out b/server/testdata/tools/xlam.out
new file mode 100644
index 00000000..a4a9952f
--- /dev/null
+++ b/server/testdata/tools/xlam.out
@@ -0,0 +1,40 @@
+You are a knowledgable assistant. You can answer questions and perform tasks.
+### Instruction:
+What's the weather like today in Paris?
+### Response:
+{"tool_calls": [{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Paris, France"}}]}
+<|EOT|>
+### Response:
+The current temperature in Paris, France is 22 degrees Celsius.
+<|EOT|>
+### Instruction:
+[BEGIN OF TASK INSTRUCTION]
+You are an expert in composing functions. You are given a question and a set of possible functions. 
+Based on the question, you will need to make one or more function/tool calls to achieve the purpose. 
+If none of the functions can be used, point it out and refuse to answer. 
+If the given question lacks the parameters required by the function, also point it out.
+[END OF TASK INSTRUCTION]
+
+[BEGIN OF AVAILABLE TOOLS]
+[{"type":"function","function":{"name":"get_current_weather","description":"Get the current weather","parameters":{"type":"object","required":["location","format"],"properties":{"format":{"type":"string","description":"The temperature unit to use. Infer this from the users location.","enum":["celsius","fahrenheit"]},"location":{"type":"string","description":"The city and state, e.g. San Francisco, CA"}}}}}]
+[END OF AVAILABLE TOOLS]
+
+[BEGIN OF FORMAT INSTRUCTION]
+The output MUST strictly adhere to the following JSON format, and NO other text MUST be included.
+The example format is as follows. Please make sure the parameter type is correct. If no function call is needed, please make tool_calls an empty list '[]'.
+```
+{
+    "tool_calls": [
+    {"name": "func_name1", "arguments": {"argument1": "value1", "argument2": "value2"}},
+    ... (more tool calls as required)
+    ]
+}
+```
+[END OF FORMAT INSTRUCTION]
+
+[BEGIN OF QUERY]
+What's the weather like today in San Francisco and Toronto?
+[END OF QUERY]
+
+
+### Response:
\ No newline at end of file
diff --git a/server/upload.go b/server/upload.go
index 73ce78ce..c4078c22 100644
--- a/server/upload.go
+++ b/server/upload.go
@@ -254,7 +254,7 @@ func (b *blobUpload) uploadPart(ctx context.Context, method string, requestURL *
 
 		// retry uploading to the redirect URL
 		for try := range maxRetries {
-			err = b.uploadPart(ctx, http.MethodPut, redirectURL, part, nil)
+			err = b.uploadPart(ctx, http.MethodPut, redirectURL, part, ®istryOptions{})
 			switch {
 			case errors.Is(err, context.Canceled):
 				return err
diff --git a/template/alfred.json b/template/alfred.json
new file mode 100644
index 00000000..edac21af
--- /dev/null
+++ b/template/alfred.json
@@ -0,0 +1,8 @@
+{
+  "stop": [
+    "",
+    "",
+    "",
+    ""
+  ]
+}
diff --git a/template/alpaca.json b/template/alpaca.json
new file mode 100644
index 00000000..eafe2b8a
--- /dev/null
+++ b/template/alpaca.json
@@ -0,0 +1,6 @@
+{
+  "stop": [
+    "### Instruction:",
+    "### Response"
+  ]
+}
diff --git a/template/chatml.json b/template/chatml.json
new file mode 100644
index 00000000..7afeb3de
--- /dev/null
+++ b/template/chatml.json
@@ -0,0 +1,6 @@
+{
+  "stop": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ]
+}
diff --git a/template/chatqa.json b/template/chatqa.json
new file mode 100644
index 00000000..64dd0f33
--- /dev/null
+++ b/template/chatqa.json
@@ -0,0 +1,8 @@
+{
+  "stop": [
+    "System:",
+    "User:",
+    "Assistant:",
+    "<|begin_of_text|>"
+  ]
+}
diff --git a/template/codellama-70b-instruct.json b/template/codellama-70b-instruct.json
new file mode 100644
index 00000000..a56a63f1
--- /dev/null
+++ b/template/codellama-70b-instruct.json
@@ -0,0 +1,7 @@
+{
+  "stop": [
+    "Source:",
+    "Destination:",
+    ""
+  ]
+}
diff --git a/template/falcon-instruct.json b/template/falcon-instruct.json
new file mode 100644
index 00000000..a0da0e81
--- /dev/null
+++ b/template/falcon-instruct.json
@@ -0,0 +1,6 @@
+{
+  "stop": [
+    "User:",
+    "Assistant:"
+  ]
+}
diff --git a/template/gemma-instruct.json b/template/gemma-instruct.json
new file mode 100644
index 00000000..f4ad415c
--- /dev/null
+++ b/template/gemma-instruct.json
@@ -0,0 +1,6 @@
+{
+  "stop": [
+    "",
+    ""
+  ]
+}
diff --git a/template/granite-instruct.json b/template/granite-instruct.json
new file mode 100644
index 00000000..0933e4b5
--- /dev/null
+++ b/template/granite-instruct.json
@@ -0,0 +1,7 @@
+{
+  "stop": [
+    "System:",
+    "Question:",
+    "Answer:"
+  ]
+}
diff --git a/template/llama2-chat.json b/template/llama2-chat.json
new file mode 100644
index 00000000..17590ab4
--- /dev/null
+++ b/template/llama2-chat.json
@@ -0,0 +1,8 @@
+{
+  "stop": [
+    "[INST]",
+    "[/INST]",
+    "<>",
+    "<>"
+  ]
+}
diff --git a/template/llama3-instruct.json b/template/llama3-instruct.json
new file mode 100644
index 00000000..c4e9d448
--- /dev/null
+++ b/template/llama3-instruct.json
@@ -0,0 +1,7 @@
+{
+  "stop": [
+    "<|start_header_id|>",
+    "<|end_header_id|>",
+    "<|eot_id|>"
+  ]
+}
diff --git a/template/magicoder.json b/template/magicoder.json
new file mode 100644
index 00000000..6f67cab0
--- /dev/null
+++ b/template/magicoder.json
@@ -0,0 +1,6 @@
+{
+  "stop": [
+    "@@ Instruction",
+    "@@ Response"
+  ]
+}
diff --git a/template/mistral-instruct.json b/template/mistral-instruct.json
new file mode 100644
index 00000000..7afeb3de
--- /dev/null
+++ b/template/mistral-instruct.json
@@ -0,0 +1,6 @@
+{
+  "stop": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ]
+}
diff --git a/template/openchat.json b/template/openchat.json
new file mode 100644
index 00000000..0edc341f
--- /dev/null
+++ b/template/openchat.json
@@ -0,0 +1,5 @@
+{
+  "stop": [
+    "<|end_of_turn|>"
+  ]
+}
diff --git a/template/phi-3.json b/template/phi-3.json
new file mode 100644
index 00000000..27bf7664
--- /dev/null
+++ b/template/phi-3.json
@@ -0,0 +1,8 @@
+{
+  "stop": [
+    "<|end|>",
+    "<|system|>",
+    "<|user|>",
+    "<|assistant|>"
+  ]
+}
diff --git a/template/solar-instruct.json b/template/solar-instruct.json
new file mode 100644
index 00000000..7b7a9050
--- /dev/null
+++ b/template/solar-instruct.json
@@ -0,0 +1,7 @@
+{
+  "stop": [
+    "### System:",
+    "### User:",
+    "### Assistant"
+  ]
+}
diff --git a/template/starcoder2-instruct.json b/template/starcoder2-instruct.json
new file mode 100644
index 00000000..31348908
--- /dev/null
+++ b/template/starcoder2-instruct.json
@@ -0,0 +1,7 @@
+{
+  "stop": [
+    "### Instruction",
+    "### Response",
+    "<|endoftext|>"
+  ]
+}
diff --git a/template/template.go b/template/template.go
index 90014ec1..3e0afcd1 100644
--- a/template/template.go
+++ b/template/template.go
@@ -23,6 +23,7 @@ import (
 var indexBytes []byte
 
 //go:embed *.gotmpl
+//go:embed *.json
 var templatesFS embed.FS
 
 var templatesOnce = sync.OnceValues(func() ([]*named, error) {
@@ -39,6 +40,15 @@ var templatesOnce = sync.OnceValues(func() ([]*named, error) {
 
 		// normalize line endings
 		t.Bytes = bytes.ReplaceAll(bts, []byte("\r\n"), []byte("\n"))
+
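+		// load optional parameters (e.g. stop sequences) from a sidecar .json with the same name; templates without one are skipped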
+		params, err := templatesFS.ReadFile(t.Name + ".json")
+		if err != nil {
+			continue
+		}
+
+		if err := json.Unmarshal(params, &t.Parameters); err != nil {
+			return nil, err
+		}
 	}
 
 	return templates, nil
@@ -48,6 +58,10 @@ type named struct {
 	Name     string `json:"name"`
 	Template string `json:"template"`
 	Bytes    []byte
+
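+	// Parameters holds optional defaults read from the template's .json sidecar; currently only stop sequences.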
+	Parameters *struct {
+		Stop []string `json:"stop"`
+	}
 }
 
 func (t named) Reader() io.Reader {
@@ -102,8 +116,15 @@ var response = parse.ActionNode{
 	},
 }
 
+var funcs = template.FuncMap{
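+	// "json" lets templates marshal a value to a JSON string; marshal errors are ignored and produce an empty string.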
+	"json": func(v any) string {
+		b, _ := json.Marshal(v)
+		return string(b)
+	},
+}
+
 func Parse(s string) (*Template, error) {
-	tmpl := template.New("").Option("missingkey=zero")
+	tmpl := template.New("").Option("missingkey=zero").Funcs(funcs)
 
 	tmpl, err := tmpl.Parse(s)
 	if err != nil {
@@ -127,7 +148,7 @@ func (t *Template) Vars() []string {
 	var vars []string
 	for _, tt := range t.Templates() {
 		for _, n := range tt.Root.Nodes {
-			vars = append(vars, parseNode(n)...)
+			vars = append(vars, Identifiers(n)...)
 		}
 	}
 
@@ -143,17 +164,74 @@ func (t *Template) Vars() []string {
 
 type Values struct {
 	Messages []api.Message
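+	// Tools, Prompt, and Suffix extend the legacy message-only values: Tools feeds tool-enabled templates, while Prompt and Suffix support completion with a suffix (e.g. code infill).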
+	api.Tools
+	Prompt string
+	Suffix string
 
 	// forceLegacy is a flag used to test compatibility with legacy templates
 	forceLegacy bool
 }
 
+func (t *Template) Subtree(fn func(parse.Node) bool) *template.Template {
+	var walk func(parse.Node) parse.Node
+	walk = func(n parse.Node) parse.Node {
+		if fn(n) {
+			return n
+		}
+
+		switch t := n.(type) {
+		case *parse.ListNode:
+			for _, c := range t.Nodes {
+				if n := walk(c); n != nil {
+					return n
+				}
+			}
+		case *parse.BranchNode:
+			for _, n := range []*parse.ListNode{t.List, t.ElseList} {
+				if n != nil {
+					if n := walk(n); n != nil {
+						return n
+					}
+				}
+			}
+		case *parse.IfNode:
+			return walk(&t.BranchNode)
+		case *parse.WithNode:
+			return walk(&t.BranchNode)
+		case *parse.RangeNode:
+			return walk(&t.BranchNode)
+		}
+
+		return nil
+	}
+
+	if n := walk(t.Tree.Root); n != nil {
+		return (&template.Template{
+			Tree: &parse.Tree{
+				Root: &parse.ListNode{
+					Nodes: []parse.Node{n},
+				},
+			},
+		}).Funcs(funcs)
+	}
+
+	return nil
+}
+
 func (t *Template) Execute(w io.Writer, v Values) error {
 	system, messages := collate(v.Messages)
-	if !v.forceLegacy && slices.Contains(t.Vars(), "messages") {
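+	// an explicit prompt/suffix pair takes precedence over messages and is rendered directly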
+	if v.Prompt != "" && v.Suffix != "" {
+		return t.Template.Execute(w, map[string]any{
+			"Prompt":   v.Prompt,
+			"Suffix":   v.Suffix,
+			"Response": "",
+		})
+	} else if !v.forceLegacy && slices.Contains(t.Vars(), "messages") {
 		return t.Template.Execute(w, map[string]any{
 			"System":   system,
 			"Messages": messages,
+			"Tools":    v.Tools,
+			"Response": "",
 		})
 	}
 
@@ -161,7 +239,7 @@ func (t *Template) Execute(w io.Writer, v Values) error {
 	var b bytes.Buffer
 	var prompt, response string
 	for _, m := range messages {
-		execute := func () error {
+		execute := func() error {
 			if err := t.Template.Execute(&b, map[string]any{
 				"System":   system,
 				"Prompt":   prompt,
@@ -198,12 +276,9 @@ func (t *Template) Execute(w io.Writer, v Values) error {
 
 	var cut bool
 	nodes := deleteNode(t.Template.Root.Copy(), func(n parse.Node) bool {
-		switch t := n.(type) {
-		case *parse.ActionNode:
-		case *parse.FieldNode:
-			if slices.Contains(t.Ident, "Response") {
-				cut = true
-			}
+		if field, ok := n.(*parse.FieldNode); ok && slices.Contains(field.Ident, "Response") {
+			cut = true
+			return false
 		}
 
 		return cut
@@ -211,8 +286,9 @@ func (t *Template) Execute(w io.Writer, v Values) error {
 
 	tree := parse.Tree{Root: nodes.(*parse.ListNode)}
 	if err := template.Must(template.New("").AddParseTree("", &tree)).Execute(&b, map[string]any{
-		"System": system,
-		"Prompt": prompt,
+		"System":   system,
+		"Prompt":   prompt,
+		"Response": response,
 	}); err != nil {
 		return err
 	}
@@ -255,50 +331,46 @@ func collate(msgs []api.Message) (string, []*api.Message) {
 	return strings.Join(system, "\n\n"), collated
 }
 
-func parseNode(n parse.Node) []string {
+// Identifiers walks the node tree returning any identifiers it finds along the way
+func Identifiers(n parse.Node) []string {
 	switch n := n.(type) {
+	case *parse.ListNode:
+		var names []string
+		for _, n := range n.Nodes {
+			names = append(names, Identifiers(n)...)
+		}
+
+		return names
+	case *parse.TemplateNode:
+		return Identifiers(n.Pipe)
 	case *parse.ActionNode:
-		return parseNode(n.Pipe)
+		return Identifiers(n.Pipe)
+	case *parse.BranchNode:
+		names := Identifiers(n.Pipe)
+		for _, n := range []*parse.ListNode{n.List, n.ElseList} {
+			if n != nil {
+				names = append(names, Identifiers(n)...)
+			}
+		}
+		return names
 	case *parse.IfNode:
-		names := parseNode(n.Pipe)
-		names = append(names, parseNode(n.List)...)
-		if n.ElseList != nil {
-			names = append(names, parseNode(n.ElseList)...)
-		}
-		return names
+		return Identifiers(&n.BranchNode)
 	case *parse.RangeNode:
-		names := parseNode(n.Pipe)
-		names = append(names, parseNode(n.List)...)
-		if n.ElseList != nil {
-			names = append(names, parseNode(n.ElseList)...)
-		}
-		return names
+		return Identifiers(&n.BranchNode)
 	case *parse.WithNode:
-		names := parseNode(n.Pipe)
-		names = append(names, parseNode(n.List)...)
-		if n.ElseList != nil {
-			names = append(names, parseNode(n.ElseList)...)
-		}
-		return names
+		return Identifiers(&n.BranchNode)
 	case *parse.PipeNode:
 		var names []string
 		for _, c := range n.Cmds {
 			for _, a := range c.Args {
-				names = append(names, parseNode(a)...)
+				names = append(names, Identifiers(a)...)
 			}
 		}
-		return names
-	case *parse.ListNode:
-		var names []string
-		for _, n := range n.Nodes {
-			names = append(names, parseNode(n)...)
-		}
-
 		return names
 	case *parse.FieldNode:
 		return n.Ident
-	case *parse.TemplateNode:
-		return parseNode(n.Pipe)
+	case *parse.VariableNode:
+		return n.Ident
 	}
 
 	return nil
diff --git a/template/template_test.go b/template/template_test.go
index c678f1b1..b46e1df5 100644
--- a/template/template_test.go
+++ b/template/template_test.go
@@ -260,6 +260,26 @@ func TestExecuteWithMessages(t *testing.T) {
 
 Hello friend![/INST] Hello human![INST] What is your name?[/INST] `,
 		},
+		{
+			"mistral assistant",
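+			// a trailing assistant message is emitted verbatim so the model can continue the partial response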
+			[]template{
+				{"no response", `[INST] {{ .Prompt }}[/INST] `},
+				{"response", `[INST] {{ .Prompt }}[/INST] {{ .Response }}`},
+				{"messages", `
+{{- range $i, $m := .Messages }}
+{{- if eq .Role "user" }}[INST] {{ .Content }}[/INST] {{ else if eq .Role "assistant" }}{{ .Content }}{{ end }}
+{{- end }}`},
+			},
+			Values{
+				Messages: []api.Message{
+					{Role: "user", Content: "Hello friend!"},
+					{Role: "assistant", Content: "Hello human!"},
+					{Role: "user", Content: "What is your name?"},
+					{Role: "assistant", Content: "My name is Ollama and I"},
+				},
+			},
+			`[INST] Hello friend![/INST] Hello human![INST] What is your name?[/INST] My name is Ollama and I`,
+		},
 		{
 			"chatml",
 			[]template{
@@ -359,3 +379,38 @@ Answer: `,
 		})
 	}
 }
+
+func TestExecuteWithSuffix(t *testing.T) {
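+	// exercises the Prompt/Suffix path with a small fill-in-middle style template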
+	tmpl, err := Parse(`{{- if .Suffix }}<PRE> {{ .Prompt }} <SUF>{{ .Suffix }} <MID>
+{{- else }}{{ .Prompt }}
+{{- end }}`)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	cases := []struct {
+		name   string
+		values Values
+		expect string
+	}{
+		{
+			"message", Values{Messages: []api.Message{{Role: "user", Content: "hello"}}}, "hello",
+		},
+		{
+			"prompt suffix", Values{Prompt: "def add(", Suffix: "return x"}, "<PRE> def add( <SUF>return x <MID>",
+		},
+	}
+
+	for _, tt := range cases {
+		t.Run(tt.name, func(t *testing.T) {
+			var b bytes.Buffer
+			if err := tmpl.Execute(&b, tt.values); err != nil {
+				t.Fatal(err)
+			}
+
+			if diff := cmp.Diff(b.String(), tt.expect); diff != "" {
+				t.Errorf("mismatch (-got +want):\n%s", diff)
+			}
+		})
+	}
+}
diff --git a/template/vicuna.json b/template/vicuna.json
new file mode 100644
index 00000000..ed7bfb0f
--- /dev/null
+++ b/template/vicuna.json
@@ -0,0 +1,6 @@
+{
+  "stop": [
+    "USER:",
+    "ASSISTANT:"
+  ]
+}
diff --git a/template/zephyr.json b/template/zephyr.json
new file mode 100644
index 00000000..f9c0115c
--- /dev/null
+++ b/template/zephyr.json
@@ -0,0 +1,8 @@
+{
+  "stop": [
+    "<|system|>",
+    "",
+    "<|user|>",
+    "<|assistant|>"
+  ]
+}