Compare commits

...

15 Commits

Author SHA1 Message Date

1401b24c79
Remove mem check
2024-11-14 13:26:13 +01:00

Blake Mizerany
67691e410d
cmd: preserve exact bytes when displaying template/system layers (#7586)
2024-11-13 23:53:30 -08:00

Jesse Gross
d7eb05b936
runner.go: Fix off-by-one for num predicted
2024-11-12 11:35:57 -08:00

Daniel Hiltgen
636a743c2b
CI: give windows lint more time (#7635)
It looks like 8 minutes isn't quite enough and we're seeing sporadic timeouts
2024-11-12 11:22:39 -08:00

Daniel Hiltgen
df011054fa
Jetpack support for Go server (#7217)
This adds support for the Jetson JetPack variants into the Go runner
2024-11-12 10:31:52 -08:00

Daniel Hiltgen
ac07160c8d
doc: capture numeric group requirement (#6941)
Docker uses the container filesystem for name resolution, so we can't guide users to use the name of the host group. Instead they must specify the numeric ID.
2024-11-12 09:13:23 -08:00

Daniel Hiltgen
6606e4243c
docs: Capture docker cgroup workaround (#7519)
GPU support can break on some systems after a while. This captures a known workaround to solve the problem.
2024-11-12 09:12:50 -08:00

Jesse Gross
65973ceb64
runner.go: Make KV entry accounting more robust
The structure of the accounting for KV cache shifting was carried over from the old runner but it now doesn't feel natural with the new runner. There are a number of invariants that should hold true but are difficult to reason about. There is at least one bug report that would imply that the invariants are not holding.
This reduces the number of implicit assumptions and is more forgiving of unexpected situations. It also improves behavior around which input tokens are kept when truncation occurs.
Bug #7545
2024-11-11 20:23:03 -08:00

Joey Zheng
bebef1e50d
readme: add aichat terminal app to community integrations (#7418)
2024-11-11 16:44:46 -08:00

Evan
d48c1c5a44
api: fix typos in Go Doc comments (#7620)
2024-11-11 16:21:58 -08:00

Prasad Bhalerao
36a8372b28
readme: add GoLamify to community integrations (#7521)
2024-11-10 22:38:18 -08:00

Ivo Stoykov
4e94227b5d
readme: add browser extension that enables using Ollama for interacting with web pages (#5827)
2024-11-10 22:14:22 -08:00

frances720
479d551766
docs: add mentions of Llama 3.2 (#7517)
2024-11-10 19:04:23 -08:00

Evan
76b2b723b2
api: fix typo in python ClientFromEnvironment docs (#7604)
2024-11-10 17:30:27 -08:00

Arhan Busam
b8d77cdeab
readme: add llama3.2-vision to model list (#7580)
2024-11-10 13:36:25 -08:00
18 changed files with 180 additions and 126 deletions

View File

@ -281,7 +281,7 @@ jobs:
shell: bash
- uses: golangci/golangci-lint-action@v6
with:
args: --timeout 8m0s -v
args: --timeout 10m0s -v
test:
strategy:
matrix:

View File

@ -5,6 +5,8 @@ ARG CUDA_V11_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86"
ARG CUDA_VERSION_12=12.4.0
ARG CUDA_V12_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a"
ARG ROCM_VERSION=6.1.2
ARG JETPACK_6=r36.2.0
ARG JETPACK_5=r35.4.1
### To create a local image for building linux binaries on mac or windows with efficient incremental builds
#
@ -13,7 +15,7 @@ ARG ROCM_VERSION=6.1.2
#
### Then incremental builds will be much faster in this container
#
# make -C llama -j 10 && go build -trimpath -o dist/linux-amd64/ollama .
# make -j 10 && go build -trimpath -o dist/linux-amd64/ollama .
#
FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS unified-builder-amd64
ARG CMAKE_VERSION
@ -76,9 +78,9 @@ ARG CUDA_V12_ARCHITECTURES
ARG OLLAMA_FAST_BUILD
RUN --mount=type=cache,target=/root/.ccache \
if grep "^flags" /proc/cpuinfo|grep avx>/dev/null; then \
make -C llama -j $(expr $(nproc) / 2 ) ; \
make -j $(expr $(nproc) / 2 ) ; \
else \
make -C llama -j 5 ; \
make -j 5 ; \
fi
FROM --platform=linux/arm64 unified-builder-arm64 AS runners-arm64
@ -90,7 +92,46 @@ ARG CUDA_V11_ARCHITECTURES
ARG CUDA_V12_ARCHITECTURES
ARG OLLAMA_FAST_BUILD
RUN --mount=type=cache,target=/root/.ccache \
make -C llama -j 8
make -j 5
# Jetsons need to be built in discrete stages
FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_5} AS runners-jetpack5-arm64
ARG GOLANG_VERSION
RUN apt-get update && apt-get install -y git curl ccache && \
curl -s -L https://dl.google.com/go/go${GOLANG_VERSION}.linux-arm64.tar.gz | tar xz -C /usr/local && \
ln -s /usr/local/go/bin/go /usr/local/bin/go && \
ln -s /usr/local/go/bin/gofmt /usr/local/bin/gofmt && \
apt-get clean && rm -rf /var/lib/apt/lists/*
WORKDIR /go/src/github.com/ollama/ollama/
COPY . .
ARG CGO_CFLAGS
ENV GOARCH arm64
RUN --mount=type=cache,target=/root/.ccache \
make -j 5 cuda_v11 \
CUDA_ARCHITECTURES="72;87" \
GPU_RUNNER_VARIANT=_jetpack5 \
CGO_EXTRA_LDFLAGS_LINUX=-L/usr/local/cuda/lib64/stubs \
DIST_LIB_DIR=/go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack5/lib/ollama \
DIST_GPU_RUNNER_DEPS_DIR=/go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack5/lib/ollama/cuda_jetpack5
FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_6} AS runners-jetpack6-arm64
ARG GOLANG_VERSION
RUN apt-get update && apt-get install -y git curl ccache && \
curl -s -L https://dl.google.com/go/go${GOLANG_VERSION}.linux-arm64.tar.gz | tar xz -C /usr/local && \
ln -s /usr/local/go/bin/go /usr/local/bin/go && \
ln -s /usr/local/go/bin/gofmt /usr/local/bin/gofmt && \
apt-get clean && rm -rf /var/lib/apt/lists/*
WORKDIR /go/src/github.com/ollama/ollama/
COPY . .
ARG CGO_CFLAGS
ENV GOARCH arm64
RUN --mount=type=cache,target=/root/.ccache \
make -j 5 cuda_v12 \
CUDA_ARCHITECTURES="87" \
GPU_RUNNER_VARIANT=_jetpack6 \
CGO_EXTRA_LDFLAGS_LINUX=-L/usr/local/cuda/lib64/stubs \
DIST_LIB_DIR=/go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack6/lib/ollama \
DIST_GPU_RUNNER_DEPS_DIR=/go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack6/lib/ollama/cuda_jetpack6
# Intermediate stages used for ./scripts/build_linux.sh
@ -134,12 +175,20 @@ FROM --platform=linux/arm64 builder-arm64 AS build-arm64
COPY . .
COPY --from=runners-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
COPY --from=runners-arm64 /go/src/github.com/ollama/ollama/build/ build/
COPY --from=runners-jetpack5-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
COPY --from=runners-jetpack5-arm64 /go/src/github.com/ollama/ollama/build/ build/
COPY --from=runners-jetpack6-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
COPY --from=runners-jetpack6-arm64 /go/src/github.com/ollama/ollama/build/ build/
ARG GOFLAGS
ARG CGO_CFLAGS
RUN --mount=type=cache,target=/root/.ccache \
go build -trimpath -o dist/linux-arm64/bin/ollama .
RUN cd dist/linux-$GOARCH && \
tar --exclude runners -cf - . | pigz --best > ../ollama-linux-$GOARCH.tgz
RUN cd dist/linux-$GOARCH-jetpack5 && \
tar --exclude runners -cf - . | pigz --best > ../ollama-linux-$GOARCH-jetpack5.tgz
RUN cd dist/linux-$GOARCH-jetpack6 && \
tar --exclude runners -cf - . | pigz --best > ../ollama-linux-$GOARCH-jetpack6.tgz
FROM --platform=linux/amd64 scratch AS dist-amd64
COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/ollama-linux-*.tgz /
@ -180,16 +229,23 @@ RUN rm -rf \
FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-amd64
RUN apt-get update && \
apt-get install -y ca-certificates && \
rm -rf /var/lib/apt/lists/*
apt-get clean && rm -rf /var/lib/apt/lists/*
COPY --from=container-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
COPY --from=runners-cuda-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
FROM --platform=linux/arm64 ubuntu:22.04 AS runtime-arm64
COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack5/lib/ /lib/
COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack6/lib/ /lib/
RUN apt-get update && \
apt-get install -y ca-certificates && \
rm -rf /var/lib/apt/lists/*
apt-get clean && rm -rf /var/lib/apt/lists/*
COPY --from=container-build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/bin/ /bin/
COPY --from=runners-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
COPY --from=cpu-build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
COPY --from=cuda-11-build-runner-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
COPY --from=cuda-12-build-runner-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
COPY --from=cuda-build-jetpack5-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
COPY --from=cuda-build-jetpack6-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
# ROCm libraries larger so we keep it distinct from the CPU/CUDA image
FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-rocm
@ -198,7 +254,7 @@ FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-rocm
COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64-rocm/lib/ /lib/
RUN apt-get update && \
apt-get install -y ca-certificates && \
rm -rf /var/lib/apt/lists/*
apt-get clean && rm -rf /var/lib/apt/lists/*
COPY --from=container-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
COPY --from=runners-rocm-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/

View File

@ -47,26 +47,28 @@ Ollama supports a list of models available on [ollama.com/library](https://ollam
Here are some example models that can be downloaded:
| Model | Parameters | Size | Download |
| ------------------ | ---------- | ----- | ------------------------------ |
| Llama 3.2 | 3B | 2.0GB | `ollama run llama3.2` |
| Llama 3.2 | 1B | 1.3GB | `ollama run llama3.2:1b` |
| Llama 3.1 | 8B | 4.7GB | `ollama run llama3.1` |
| Llama 3.1 | 70B | 40GB | `ollama run llama3.1:70b` |
| Llama 3.1 | 405B | 231GB | `ollama run llama3.1:405b` |
| Phi 3 Mini | 3.8B | 2.3GB | `ollama run phi3` |
| Phi 3 Medium | 14B | 7.9GB | `ollama run phi3:medium` |
| Gemma 2 | 2B | 1.6GB | `ollama run gemma2:2b` |
| Gemma 2 | 9B | 5.5GB | `ollama run gemma2` |
| Gemma 2 | 27B | 16GB | `ollama run gemma2:27b` |
| Mistral | 7B | 4.1GB | `ollama run mistral` |
| Moondream 2 | 1.4B | 829MB | `ollama run moondream` |
| Neural Chat | 7B | 4.1GB | `ollama run neural-chat` |
| Starling | 7B | 4.1GB | `ollama run starling-lm` |
| Code Llama | 7B | 3.8GB | `ollama run codellama` |
| Llama 2 Uncensored | 7B | 3.8GB | `ollama run llama2-uncensored` |
| LLaVA | 7B | 4.5GB | `ollama run llava` |
| Solar | 10.7B | 6.1GB | `ollama run solar` |
| Model | Parameters | Size | Download |
| ------------------ | ---------- | ----- | -------------------------------- |
| Llama 3.2 | 3B | 2.0GB | `ollama run llama3.2` |
| Llama 3.2 | 1B | 1.3GB | `ollama run llama3.2:1b` |
| Llama 3.2 Vision | 11B | 7.9GB | `ollama run llama3.2-vision` |
| Llama 3.2 Vision | 90B | 55GB | `ollama run llama3.2-vision:90b` |
| Llama 3.1 | 8B | 4.7GB | `ollama run llama3.1` |
| Llama 3.1 | 70B | 40GB | `ollama run llama3.1:70b` |
| Llama 3.1 | 405B | 231GB | `ollama run llama3.1:405b` |
| Phi 3 Mini | 3.8B | 2.3GB | `ollama run phi3` |
| Phi 3 Medium | 14B | 7.9GB | `ollama run phi3:medium` |
| Gemma 2 | 2B | 1.6GB | `ollama run gemma2:2b` |
| Gemma 2 | 9B | 5.5GB | `ollama run gemma2` |
| Gemma 2 | 27B | 16GB | `ollama run gemma2:27b` |
| Mistral | 7B | 4.1GB | `ollama run mistral` |
| Moondream 2 | 1.4B | 829MB | `ollama run moondream` |
| Neural Chat | 7B | 4.1GB | `ollama run neural-chat` |
| Starling | 7B | 4.1GB | `ollama run starling-lm` |
| Code Llama | 7B | 3.8GB | `ollama run codellama` |
| Llama 2 Uncensored | 7B | 3.8GB | `ollama run llama2-uncensored` |
| LLaVA | 7B | 4.5GB | `ollama run llava` |
| Solar | 10.7B | 6.1GB | `ollama run solar` |
> [!NOTE]
> You should have at least 8 GB of RAM available to run the 7B models, 16 GB to run the 13B models, and 32 GB to run the 33B models.
@ -359,6 +361,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Ollama eBook Summary](https://github.com/cognitivetech/ollama-ebook-summary/)
- [Ollama Mixture of Experts (MOE) in 50 lines of code](https://github.com/rapidarchitect/ollama_moe)
- [vim-intelligence-bridge](https://github.com/pepo-ec/vim-intelligence-bridge) Simple interaction of "Ollama" with the Vim editor
- [aichat](https://github.com/sigoden/aichat) All-in-one LLM CLI tool featuring Shell Assistant, Chat-REPL, RAG, AI tools & agents, with access to OpenAI, Claude, Gemini, Ollama, Groq, and more.
### Apple Vision Pro
- [Enchanted](https://github.com/AugustDev/enchanted)
@ -415,6 +418,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Ollama PHP](https://github.com/ArdaGnsrn/ollama-php)
- [Agents-Flex for Java](https://github.com/agents-flex/agents-flex) with [example](https://github.com/agents-flex/agents-flex/tree/main/agents-flex-llm/agents-flex-llm-ollama/src/test/java/com/agentsflex/llm/ollama)
- [Ollama for Swift](https://github.com/mattt/ollama-swift)
- [GoLamify](https://github.com/prasad89/golamify)
### Mobile
@ -452,6 +456,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Discord-Ollama Chat Bot](https://github.com/kevinthedang/discord-ollama) (Generalized TypeScript Discord Bot w/ Tuning Documentation)
- [Discord AI chat/moderation bot](https://github.com/rapmd73/Companion) Chat/moderation bot written in python. Uses Ollama to create personalities.
- [Headless Ollama](https://github.com/nischalj10/headless-ollama) (Scripts to automatically install ollama client & models on any OS for apps that depends on ollama server)
- [Local AI Helper](https://github.com/ivostoykov/localAI) (Chrome and Firefox extensions that enable interactions with the active tab and customisable API endpoints. Includes secure storage for user prompts.)
- [vnc-lm](https://github.com/jk011ru/vnc-lm) (A containerized Discord bot with support for attachments and web links)
- [LSP-AI](https://github.com/SilasMarvin/lsp-ai) (Open-source language server for AI-powered functionality)
- [QodeAssist](https://github.com/Palm1r/QodeAssist) (AI-powered coding assistant plugin for Qt Creator)

View File

@ -55,7 +55,7 @@ func checkError(resp *http.Response, body []byte) error {
// ClientFromEnvironment creates a new [Client] using configuration from the
// environment variable OLLAMA_HOST, which points to the network host and
// port on which the ollama service is listenting. The format of this variable
// port on which the ollama service is listening. The format of this variable
// is:
//
// <scheme>://<host>:<port>
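For context, `ClientFromEnvironment` is the usual entry point for Go callers of the api package; a minimal sketch of how the documented `OLLAMA_HOST` format is consumed, assuming the default port 11434 (the host value here is only a placeholder):

```go
package main

import (
	"context"
	"fmt"
	"log"
	"os"

	"github.com/ollama/ollama/api"
)

func main() {
	// OLLAMA_HOST follows the documented <scheme>://<host>:<port> format.
	os.Setenv("OLLAMA_HOST", "http://127.0.0.1:11434")

	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}

	// Verify the configured host is reachable before issuing requests.
	if err := client.Heartbeat(context.Background()); err != nil {
		log.Fatal(err)
	}
	fmt.Println("ollama is reachable")
}
```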

View File

@ -12,7 +12,7 @@ import (
"time"
)
// StatusError is an error with and HTTP status code.
// StatusError is an error with an HTTP status code and message.
type StatusError struct {
StatusCode int
Status string
@ -57,7 +57,7 @@ type GenerateRequest struct {
Template string `json:"template"`
// Context is the context parameter returned from a previous call to
// Generate call. It can be used to keep a short conversational memory.
// [Client.Generate]. It can be used to keep a short conversational memory.
Context []int `json:"context,omitempty"`
// Stream specifies whether the response is streaming; it is true by default.
@ -90,14 +90,14 @@ type ChatRequest struct {
// Messages is the messages of the chat - can be used to keep a chat memory.
Messages []Message `json:"messages"`
// Stream enable streaming of returned response; true by default.
// Stream enables streaming of returned responses; true by default.
Stream *bool `json:"stream,omitempty"`
// Format is the format to return the response in (e.g. "json").
Format string `json:"format"`
// KeepAlive controls how long the model will stay loaded into memory
// followin the request.
// following the request.
KeepAlive *Duration `json:"keep_alive,omitempty"`
// Tools is an optional list of tools the model has access to.
@ -203,8 +203,8 @@ type Metrics struct {
EvalDuration time.Duration `json:"eval_duration,omitempty"`
}
// Options specified in [GenerateRequest], if you add a new option here add it
// to the API docs also.
// Options specified in [GenerateRequest]. If you add a new option here, also
// add it to the API docs.
type Options struct {
Runner

View File

@ -800,9 +800,9 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
case "parameters":
fmt.Println(resp.Parameters)
case "system":
fmt.Println(resp.System)
fmt.Print(resp.System)
case "template":
fmt.Println(resp.Template)
fmt.Print(resp.Template)
}
return nil
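The switch from `Println` to `Print` is what makes `ollama show --template` and `--system` byte-exact: `Println` always appends its own newline, so a layer that does not end in one would round-trip differently. A standalone sketch of the difference (not the actual command code):

```go
package main

import "fmt"

func main() {
	layer := "{{ .Prompt }}" // imagine a template layer stored without a trailing newline

	fmt.Println(layer) // writes the bytes plus an added "\n"
	fmt.Print(layer)   // writes exactly the stored bytes, nothing more
}
```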

View File

@ -350,7 +350,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
return nil, err
}
}
gpuInfo.DependencyPath = libDir
gpuInfo.DependencyPath = []string{libDir}
if gfxOverride == "" {
// Only load supported list once

View File

@ -111,7 +111,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
UnreliableFreeMemory: true,
ID: strconv.Itoa(i), // TODO this is probably wrong if we specify visible devices
DependencyPath: libDir,
DependencyPath: []string{libDir},
MinimumMemory: rocmMinimumMemory,
Name: name,
Compute: gfx,

View File

@ -240,7 +240,7 @@ func GetGPUInfo() GpuInfoList {
Library: "cpu",
Variant: cpuCapability.String(),
ID: "0",
DependencyPath: depPath,
DependencyPath: []string{depPath},
},
CPUs: details,
},
@ -293,11 +293,11 @@ func GetGPUInfo() GpuInfoList {
gpuInfo.DriverMinor = driverMinor
variant := cudaVariant(gpuInfo)
if depPath != "" {
gpuInfo.DependencyPath = depPath
gpuInfo.DependencyPath = []string{depPath}
// Check for variant specific directory
if variant != "" {
if _, err := os.Stat(filepath.Join(depPath, "cuda_"+variant)); err == nil {
gpuInfo.DependencyPath = filepath.Join(depPath, "cuda_"+variant)
gpuInfo.DependencyPath = []string{filepath.Join(depPath, "cuda_"+variant), depPath}
}
}
}
@ -370,7 +370,7 @@ func GetGPUInfo() GpuInfoList {
gpuInfo.FreeMemory = uint64(memInfo.free)
gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
gpuInfo.DependencyPath = depPath
gpuInfo.DependencyPath = []string{depPath}
oneapiGPUs = append(oneapiGPUs, gpuInfo)
}
}

View File

@ -25,7 +25,7 @@ type GpuInfo struct { // TODO better name maybe "InferenceProcessor"?
MinimumMemory uint64 `json:"-"`
// Any extra PATH/LD_LIBRARY_PATH dependencies required for the Library to operate properly
DependencyPath string `json:"lib_path,omitempty"`
DependencyPath []string `json:"lib_path,omitempty"`
// Extra environment variables specific to the GPU as list of [key,value]
EnvWorkarounds [][2]string `json:"envs,omitempty"`
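Changing `DependencyPath` to a slice lets a GPU carry an ordered list of library directories, for example a variant-specific `cuda_jetpack5` directory followed by the base directory, which the server then prepends to the runner's library search path (see the `llm/server.go` hunk at the end of this diff). A minimal sketch of how such a slice might be folded into the environment; the paths are illustrative, not taken from a real install:

```go
package main

import (
	"fmt"
	"path/filepath"
	"strings"
)

func main() {
	// Ordered dependency dirs: the variant-specific directory first,
	// then the generic one, mirroring the new []string field.
	dependencyPath := []string{
		"/usr/lib/ollama/cuda_jetpack5", // hypothetical variant dir
		"/usr/lib/ollama",               // hypothetical base dir
	}

	// Existing search paths from the environment (placeholder value).
	libraryPaths := []string{"/usr/lib"}

	// Dependency paths go first so the exact versions compiled against win.
	libraryPaths = append(dependencyPath, libraryPaths...)

	fmt.Println("LD_LIBRARY_PATH=" + strings.Join(libraryPaths, string(filepath.ListSeparator)))
}
```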

View File

@ -32,7 +32,7 @@ ollama run my-model
Ollama supports importing adapters based on several different model architectures including:
* Llama (including Llama 2, Llama 3, and Llama 3.1);
* Llama (including Llama 2, Llama 3, Llama 3.1, and Llama 3.2);
* Mistral (including Mistral 1, Mistral 2, and Mixtral); and
* Gemma (including Gemma 1 and Gemma 2)
@ -67,14 +67,12 @@ ollama run my-model
Ollama supports importing models for several different architectures including:
* Llama (including Llama 2, Llama 3, and Llama 3.1);
* Llama (including Llama 2, Llama 3, Llama 3.1, and Llama 3.2);
* Mistral (including Mistral 1, Mistral 2, and Mixtral);
* Gemma (including Gemma 1 and Gemma 2); and
* Phi3
This includes importing foundation models as well as any fine tuned models which which have been _fused_ with a foundation model.
This includes importing foundation models as well as any fine tuned models which have been _fused_ with a foundation model.
## Importing a GGUF based model or adapter
If you have a GGUF based model or adapter it is possible to import it into Ollama. You can obtain a GGUF model or adapter by:

View File

@ -120,7 +120,7 @@ FROM <model directory>
The model directory should contain the Safetensors weights for a supported architecture.
Currently supported model architectures:
* Llama (including Llama 2, Llama 3, and Llama 3.1)
* Llama (including Llama 2, Llama 3, Llama 3.1, and Llama 3.2)
* Mistral (including Mistral 1, Mistral 2, and Mixtral)
* Gemma (including Gemma 1 and Gemma 2)
* Phi3

View File

@ -95,7 +95,9 @@ If none of those resolve the problem, gather additional information and file an
On linux, AMD GPU access typically requires `video` and/or `render` group membership to access the `/dev/kfd` device. If permissions are not set up correctly, Ollama will detect this and report an error in the server log.
When running in a container, in some Linux distributions and container runtimes, the ollama process may be unable to access the GPU. Use `ls -ld /dev/kfd /dev/dri /dev/dri/*` on the host system to determine the group assignments on your system, and pass additional `--group-add ...` arguments to the container so it can access the required devices.
When running in a container, in some Linux distributions and container runtimes, the ollama process may be unable to access the GPU. Use `ls -lnd /dev/kfd /dev/dri /dev/dri/*` on the host system to determine the **numeric** group IDs on your system, and pass additional `--group-add ...` arguments to the container so it can access the required devices. For example, in the following output `crw-rw---- 1 0 44 226, 0 Sep 16 16:55 /dev/dri/card0` the group ID column is `44`
If Ollama initially works on the GPU in a docker container, but then switches to running on CPU after some period of time with errors in the server log reporting GPU discovery failures, this can be resolved by disabling systemd cgroup management in Docker. Edit `/etc/docker/daemon.json` on the host and add `"exec-opts": ["native.cgroupdriver=cgroupfs"]` to the docker configuration.
If you are experiencing problems getting Ollama to correctly discover or use your GPU for inference, the following may help isolate the failure.
- `AMD_LOG_LEVEL=3` Enable info log levels in the AMD HIP/ROCm libraries. This can help show more detailed error codes that can help troubleshoot problems
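The key point in the doc change is that `--group-add` must receive numeric IDs, because host group names do not resolve inside the container. A small, Linux-only Go sketch of how those numeric group IDs could be read programmatically rather than eyeballed from `ls -ln` output (the device paths are the usual ones, but treat this as an illustration):

```go
package main

import (
	"fmt"
	"os"
	"path/filepath"
	"syscall"
)

func main() {
	// Device nodes that typically need to be shared with the container.
	paths, _ := filepath.Glob("/dev/dri/*")
	paths = append(paths, "/dev/kfd")

	seen := map[uint32]bool{}
	for _, p := range paths {
		info, err := os.Stat(p)
		if err != nil {
			continue // device may not exist on this system
		}
		st, ok := info.Sys().(*syscall.Stat_t)
		if !ok || seen[st.Gid] {
			continue
		}
		seen[st.Gid] = true
		// Pass each ID to docker as: --group-add <gid>
		fmt.Printf("--group-add %d  (%s)\n", st.Gid, p)
	}
}
```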

View File

@ -21,6 +21,8 @@ package llama
#cgo cuda CFLAGS: -fPIE -DGGML_USE_CUDA -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
#cgo cuda CXXFLAGS: -DGGML_USE_CUDA -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
#cgo cuda CXXFLAGS: -DGGML_USE_CUDA -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
#cgo cuda_jetpack5 LDFLAGS: -lggml_cuda_jetpack5 -L/usr/local/cuda-11/lib64
#cgo cuda_jetpack6 LDFLAGS: -lggml_cuda_jetpack6 -L/usr/local/cuda-12/lib64
#cgo cuda_v11 LDFLAGS: -lggml_cuda_v11 -L/usr/local/cuda-11/lib64
#cgo cuda_v12 LDFLAGS: -lggml_cuda_v12 -L/usr/local/cuda-12/lib64
#cgo darwin,amd64 CFLAGS: -Wno-incompatible-pointer-types-discards-qualifiers
@ -36,8 +38,8 @@ package llama
#cgo linux CXXFLAGS: -D_GNU_SOURCE
#cgo linux,amd64 LDFLAGS: -L${SRCDIR}/build/Linux/amd64
#cgo linux,amd64 LDFLAGS: -L${SRCDIR}/build/Linux/amd64
#cgo linux,arm64 CFLAGS: -D__aarch64__ -D__ARM_NEON -D__ARM_FEATURE_FMA -D__ARM_FEATURE_MATMUL_INT8
#cgo linux,arm64 CXXFLAGS: -D__aarch64__ -D__ARM_NEON -D__ARM_FEATURE_FMA -D__ARM_FEATURE_MATMUL_INT8
#cgo linux,arm64 CFLAGS: -D__aarch64__ -D__ARM_NEON -D__ARM_FEATURE_FMA
#cgo linux,arm64 CXXFLAGS: -D__aarch64__ -D__ARM_NEON -D__ARM_FEATURE_FMA
#cgo linux,arm64 LDFLAGS: -L${SRCDIR}/build/Linux/arm64
#cgo linux,arm64,sve CFLAGS: -march=armv8.6-a+sve
#cgo linux,arm64,sve CXXFLAGS: -march=armv8.6-a+sve

View File

@ -20,7 +20,7 @@ GPU_COMPILER_CFLAGS_LINUX = $(CFLAGS) -Xcompiler -fPIC -D_GNU_SOURCE
GPU_COMPILER_CXXFLAGS_WIN = $(CXXFLAGS) -D_WIN32_WINNT=0x602
GPU_COMPILER_CXXFLAGS_LINUX = $(CXXFLAGS) -Xcompiler -fPIC -D_GNU_SOURCE
GPU_LIBS = $(sort $(wildcard $(addsuffix *.$(SHARED_EXT)*,$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT)))))
GPU_DIST_DEPS_LIBS= $(sort $(addprefix $(DIST_LIB_DIR)/,$(notdir $(GPU_LIBS))))
GPU_DIST_DEPS_LIBS= $(sort $(addprefix $(DIST_GPU_RUNNER_DEPS_DIR)/,$(notdir $(GPU_LIBS))))
ifeq ($(OS),linux)
CUDA_PATH?=/usr/local/cuda

View File

@ -2,6 +2,7 @@ package main
import (
"errors"
"fmt"
"log/slog"
"reflect"
"time"
@ -22,7 +23,11 @@ type InputCache struct {
lc *llama.Context
}
func NewInputCache(lc *llama.Context, kvSize int, numSlots int, multiUserCache bool) *InputCache {
func NewInputCache(lc *llama.Context, kvSize int, numSlots int, multiUserCache bool) (*InputCache, error) {
if kvSize/numSlots < 1 {
return nil, fmt.Errorf("must have at least one kv cache entry per parallel sequence (kv: %v parallel: %v)", kvSize, numSlots)
}
slots := make([]InputCacheSlot, numSlots)
for i := range slots {
@ -37,7 +42,7 @@ func NewInputCache(lc *llama.Context, kvSize int, numSlots int, multiUserCache b
slots: slots,
multiUserCache: multiUserCache,
lc: lc,
}
}, nil
}
// Locking: Operations on InputCacheSlot (including finding one
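The new error return from `NewInputCache` guards against configurations where the KV cache is smaller than the number of parallel slots, since each slot needs at least one entry. A worked example of the check using made-up numbers:

```go
package main

import "fmt"

func main() {
	// Hypothetical configuration: a 2048-entry KV cache shared by 4 slots.
	kvSize, numSlots := 2048, 4
	fmt.Println(kvSize/numSlots >= 1) // true: each slot gets 512 entries

	// Misconfiguration: more parallel slots than cache entries.
	kvSize, numSlots = 2, 4
	fmt.Println(kvSize/numSlots >= 1) // false: NewInputCache now reports an error
}
```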
@ -58,7 +63,7 @@ type InputCacheSlot struct {
lastUsed time.Time
}
func (c *InputCache) LoadCacheSlot(prompt []input, cachePrompt bool) (*InputCacheSlot, []input, int, error) {
func (c *InputCache) LoadCacheSlot(prompt []input, cachePrompt bool) (*InputCacheSlot, []input, error) {
var slot *InputCacheSlot
var numPast int
var err error
@ -75,7 +80,7 @@ func (c *InputCache) LoadCacheSlot(prompt []input, cachePrompt bool) (*InputCach
slot, numPast, err = c.findBestCacheSlot(prompt)
}
if err != nil {
return nil, nil, 0, err
return nil, nil, err
}
if !cachePrompt {
@ -102,7 +107,7 @@ func (c *InputCache) LoadCacheSlot(prompt []input, cachePrompt bool) (*InputCach
prompt = prompt[numPast:]
slot.Inputs = slot.Inputs[:numPast]
return slot, prompt, numPast, nil
return slot, prompt, nil
}
func (c *InputCache) findLongestCacheSlot(prompt []input) (*InputCacheSlot, int, error) {
@ -194,14 +199,30 @@ func countCommonPrefix(a []input, b []input) int {
return count
}
func (c *InputCache) ShiftCacheSlot(slot *InputCacheSlot, numKeep int, numDiscard int, numPast int) {
// TODO (jessegross): KV cache removal can fail for certain types of models
// server.cpp doesn't handle this, though we can be more graceful
c.lc.KvCacheSeqRm(slot.Id, numKeep, numKeep+numDiscard)
c.lc.KvCacheSeqAdd(slot.Id, numKeep+numDiscard, numPast, -numDiscard)
// Frees up space in the KV cache by deleting the oldest half of history and shifting
// the newest half into that space (saving numKeep inputs at the beginning).
//
// Assumes that at least 1 entry can be freed up by shifting (i.e. numKeep < numCtx)
func (c *InputCache) ShiftCacheSlot(slot *InputCacheSlot, numKeep int) {
targetFree := (c.numCtx - numKeep) / 2
targetFree = max(targetFree, 1)
for i := numKeep + numDiscard; i < len(slot.Inputs); i++ {
slot.Inputs[i-numDiscard] = slot.Inputs[i]
currentFree := c.numCtx - len(slot.Inputs)
discard := targetFree - currentFree
if discard <= 0 {
return
}
slot.Inputs = slot.Inputs[:len(slot.Inputs)-numDiscard]
slog.Debug("context limit hit - shifting", "limit", c.numCtx, "input", len(slot.Inputs),
"keep", numKeep, "discard", discard)
// TODO (jessegross): KV cache removal can fail for certain types of models
c.lc.KvCacheSeqRm(slot.Id, numKeep, numKeep+discard)
c.lc.KvCacheSeqAdd(slot.Id, numKeep+discard, len(slot.Inputs), -discard)
for i := numKeep + discard; i < len(slot.Inputs); i++ {
slot.Inputs[i-discard] = slot.Inputs[i]
}
slot.Inputs = slot.Inputs[:len(slot.Inputs)-discard]
}
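A worked example of the new shift arithmetic, with made-up sizes: with `numCtx = 8`, `numKeep = 2`, and a full slot of 8 inputs, the code aims to free `(8 - 2) / 2 = 3` entries, currently has `8 - 8 = 0` free, and therefore discards 3 inputs starting just after the kept prefix. A standalone sketch of only that slice bookkeeping (no KV-cache calls):

```go
package main

import "fmt"

func main() {
	numCtx, numKeep := 8, 2
	inputs := []int{10, 11, 12, 13, 14, 15, 16, 17} // stand-ins for cached tokens

	targetFree := (numCtx - numKeep) / 2 // aim to free half of the shiftable region
	if targetFree < 1 {
		targetFree = 1
	}
	currentFree := numCtx - len(inputs)
	discard := targetFree - currentFree
	if discard <= 0 {
		return // the slot already has room; nothing to do
	}

	// Drop `discard` entries just after the kept prefix and close the gap.
	copy(inputs[numKeep:], inputs[numKeep+discard:])
	inputs = inputs[:len(inputs)-discard]

	fmt.Println(inputs) // [10 11 15 16 17]
}
```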

View File

@ -34,9 +34,6 @@ type input struct {
}
type Sequence struct {
// number of inputs evaluated
numPast int
// batch index
iBatch int
@ -112,21 +109,15 @@ func (s *Server) NewSequence(prompt string, images []ImageData, params NewSequen
params.numKeep = len(inputs)
}
if !params.embedding {
// Subtracting 4 ensures that at least 1 input can be discarded during shift
params.numKeep = min(params.numKeep, s.cache.numCtx-4)
params.numKeep += s.bosToken
} else {
// Embeddings are 1 shot - just truncate to the context window, without ever shifting
params.numKeep = min(params.numKeep, s.cache.numCtx)
if s.model.AddBOSToken() {
params.numKeep += 1
}
// truncate to fit in context window
// Ensure that at least 1 input can be discarded during shift
params.numKeep = min(params.numKeep, s.cache.numCtx-1)
if len(inputs) > s.cache.numCtx {
slog.Warn("truncating input prompt", "limit", s.cache.numCtx, "prompt", len(inputs), "numKeep", params.numKeep)
newInputs := inputs[:params.numKeep]
newInputs = append(newInputs, inputs[len(inputs)-s.cache.numCtx+params.numKeep:]...)
inputs = newInputs
slog.Warn("input exceeds context length", "prompt", len(inputs), "limit", s.cache.numCtx)
}
var sc *llama.SamplingContext
@ -231,9 +222,6 @@ type Server struct {
// KV cache
cache *InputCache
// does this model require a beginning of sequence token?
bosToken int
// next sequence for prompt processing to avoid starvation
nextSeq int
@ -258,18 +246,6 @@ func (s *Server) allNil() bool {
return true
}
func (s *Server) shiftContext(seq *Sequence) {
numLeft := seq.numPast - seq.numKeep
numDiscard := numLeft / 2
slog.Debug("context limit hit - shifting", "limit", s.cache.numCtx, "numPast", seq.numPast,
"numKeep", seq.numKeep, "numLeft", numLeft, "numDiscard", numDiscard)
s.cache.ShiftCacheSlot(seq.cache, seq.numKeep, numDiscard, seq.numPast)
seq.numPast -= numDiscard
}
func flushPending(seq *Sequence) bool {
joined := strings.Join(seq.pendingResponses, "")
seq.pendingResponses = []string{}
@ -369,17 +345,24 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
}
// if past the num predict limit
if seq.numPredict > 0 && seq.numPredicted > seq.numPredict {
if seq.numPredict > 0 && seq.numPredicted >= seq.numPredict {
s.removeSequence(seqIdx, "limit")
continue
}
if seq.numPast+len(seq.inputs) > s.cache.numCtx {
s.shiftContext(seq)
}
var numInputsProcessed int
shifted := false
for i, input := range seq.inputs {
if len(seq.cache.Inputs)+1 > s.cache.numCtx {
if !shifted {
s.cache.ShiftCacheSlot(seq.cache, seq.numKeep)
shifted = true
} else {
break
}
}
embedding := input.embed != nil
// If we don't currently have a batch, use one of the correct type and
@ -403,13 +386,12 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
}
crossAttention = seq.crossAttention
batch.Add(input.token, input.embed, seq.numPast, numInputsProcessed+1 == len(seq.inputs), seq.cache.Id)
seq.numPast++
batch.Add(input.token, input.embed, len(seq.cache.Inputs), i+1 == len(seq.inputs), seq.cache.Id)
seq.cache.Inputs = append(seq.cache.Inputs, input)
numInputsProcessed++
}
if numInputsProcessed > 0 {
seq.cache.Inputs = append(seq.cache.Inputs, seq.inputs[:numInputsProcessed]...)
seq.inputs = seq.inputs[numInputsProcessed:]
seq.iBatch = batch.NumTokens() - 1
}
@ -632,7 +614,7 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
s.mu.Lock()
for i, sq := range s.seqs {
if sq == nil {
seq.cache, seq.inputs, seq.numPast, err = s.cache.LoadCacheSlot(seq.inputs, req.CachePrompt)
seq.cache, seq.inputs, err = s.cache.LoadCacheSlot(seq.inputs, req.CachePrompt)
if err != nil {
s.mu.Unlock()
http.Error(w, fmt.Sprintf("Failed to load cache: %v", err), http.StatusInternalServerError)
@ -715,7 +697,7 @@ func (s *Server) embeddings(w http.ResponseWriter, r *http.Request) {
s.mu.Lock()
for i, sq := range s.seqs {
if sq == nil {
seq.cache, seq.inputs, seq.numPast, err = s.cache.LoadCacheSlot(seq.inputs, req.CachePrompt)
seq.cache, seq.inputs, err = s.cache.LoadCacheSlot(seq.inputs, req.CachePrompt)
if err != nil {
s.mu.Unlock()
http.Error(w, fmt.Sprintf("Failed to load cache: %v", err), http.StatusInternalServerError)
@ -802,10 +784,6 @@ func (s *Server) loadModel(
}
}
if s.model.AddBOSToken() {
s.bosToken = 1
}
if ppath != "" {
var err error
s.image, err = NewImageContext(s.lc, ppath)
@ -814,7 +792,10 @@ func (s *Server) loadModel(
}
}
s.cache = NewInputCache(s.lc, kvSize, s.parallel, multiUserCache)
s.cache, err = NewInputCache(s.lc, kvSize, s.parallel, multiUserCache)
if err != nil {
panic(err)
}
s.status = ServerStatusReady
s.ready.Done()
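The `>` to `>=` change at the top of this file's diff is the off-by-one fix from the commit list: with the old comparison a request with `num_predict = n` could emit one extra token. A simplified counting model of that check (the real check runs inside processBatch once per sampled token):

```go
package main

import "fmt"

// countStops simulates the per-token limit check with either comparison.
func countStops(limit int, strict bool) int {
	predicted := 0
	for {
		if strict && predicted >= limit { // new behaviour
			return predicted
		}
		if !strict && predicted > limit { // old behaviour (off by one)
			return predicted
		}
		predicted++ // "generate" one more token
	}
}

func main() {
	fmt.Println(countStops(8, true))  // 8 tokens with the >= check
	fmt.Println(countStops(8, false)) // 9 tokens with the old > check
}
```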

View File

@ -128,17 +128,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
}
}
// On linux and windows, over-allocating CPU memory will almost always result in an error
// Darwin has fully dynamic swap so has no direct concept of free swap space
if runtime.GOOS != "darwin" {
systemMemoryRequired := estimate.TotalSize - estimate.VRAMSize
available := systemFreeMemory + systemSwapFreeMemory
if systemMemoryRequired > available {
slog.Warn("model request too large for system", "requested", format.HumanBytes2(systemMemoryRequired), "available", available, "total", format.HumanBytes2(systemTotalMemory), "free", format.HumanBytes2(systemFreeMemory), "swap", format.HumanBytes2(systemSwapFreeMemory))
return nil, fmt.Errorf("model requires more system memory (%s) than is available (%s)", format.HumanBytes2(systemMemoryRequired), format.HumanBytes2(available))
}
}
estimate.log()
// Loop through potential servers
@ -306,9 +295,9 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
// Note: we always put the dependency path first
// since this was the exact version we compiled/linked against
if gpus[0].DependencyPath != "" {
if gpus[0].DependencyPath != nil {
// assume gpus from the same library have the same dependency path
libraryPaths = append([]string{gpus[0].DependencyPath}, libraryPaths...)
libraryPaths = append(gpus[0].DependencyPath, libraryPaths...)
}
server := filepath.Join(dir, "ollama_llama_server")