Merge 0800fef8d09a6d30b48ddb07c9bfac5e43a7dec7 into 67691e410db7a50b07a64858820b14de9aa91314

2024-11-14 09:04:55 +01:00 · 2024-11-14 09:04:55 +01:00 · 2e02f7d757
commit 2e02f7d757
parent 67691e410d 0800fef8d0
22 changed files with 369 additions and 245 deletions
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@ -85,7 +85,7 @@ jobs:
          import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
          Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo'
          if (!(gcc --version | select-string -quiet clang)) { throw "wrong gcc compiler detected - must be clang" }
-          make
+          make dist
        name: make
      - uses: actions/upload-artifact@v4
        with:
@ -143,8 +143,8 @@ jobs:
          import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
          Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo'
          if (!(gcc --version | select-string -quiet clang)) { throw "wrong gcc compiler detected - must be clang" }
-          make -C llama print-HIP_PATH print-HIP_LIB_DIR
-          make rocm
+          make help-runners
+          make dist_rocm
      - uses: actions/upload-artifact@v4
        with:
          name: generate-windows-rocm
@ -226,7 +226,7 @@ jobs:
          import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
          Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo'
          if (!(gcc --version | select-string -quiet clang)) { throw "wrong gcc compiler detected - must be clang" }
-          make cuda_v$(($env:CUDA_PATH | split-path -leaf) -replace 'v(\d+).*', '$1')
+          make dist_cuda_v$(($env:CUDA_PATH | split-path -leaf) -replace 'v(\d+).*', '$1')
      - uses: actions/upload-artifact@v4
        with:
          name: generate-windows-cuda-${{ matrix.cuda.version }}
--- a/22
+++ b/22
@ -1,9 +1,7 @@
 ARG GOLANG_VERSION=1.22.8
 ARG CMAKE_VERSION=3.22.1
 ARG CUDA_VERSION_11=11.3.1
-ARG CUDA_V11_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86"
 ARG CUDA_VERSION_12=12.4.0
-ARG CUDA_V12_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a"
 ARG ROCM_VERSION=6.1.2
 ARG JETPACK_6=r36.2.0
 ARG JETPACK_5=r35.4.1
@ -62,7 +60,7 @@ RUN yum-config-manager --add-repo https://developer.download.nvidia.com/compute/
 ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH:/usr/local/cuda/bin
 ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/cuda/lib64
 ENV LIBRARY_PATH=/usr/local/cuda/lib64/stubs:/opt/amdgpu/lib64
-ENV GOARCH amd64
+ENV GOARCH arm64
 ENV CGO_ENABLED 1
 WORKDIR /go/src/github.com/ollama/ollama/
 ENTRYPOINT [ "zsh" ]
@ -70,29 +68,21 @@ ENTRYPOINT [ "zsh" ]
 FROM --platform=linux/amd64 unified-builder-amd64 AS runners-amd64
 COPY . .
 ARG OLLAMA_SKIP_CUDA_GENERATE
-ARG OLLAMA_SKIP_CUDA_11_GENERATE
-ARG OLLAMA_SKIP_CUDA_12_GENERATE
 ARG OLLAMA_SKIP_ROCM_GENERATE
-ARG CUDA_V11_ARCHITECTURES
-ARG CUDA_V12_ARCHITECTURES
 ARG OLLAMA_FAST_BUILD
 RUN --mount=type=cache,target=/root/.ccache \
    if grep "^flags" /proc/cpuinfo|grep avx>/dev/null; then \
-        make -j $(expr $(nproc) / 2 ) ; \
+        make -j $(expr $(nproc) / 2 ) dist payload ; \
    else \
-        make -j 5 ; \
+        make -j 5 dist payload ; \
    fi

 FROM --platform=linux/arm64 unified-builder-arm64 AS runners-arm64
 COPY . .
 ARG OLLAMA_SKIP_CUDA_GENERATE
-ARG OLLAMA_SKIP_CUDA_11_GENERATE
-ARG OLLAMA_SKIP_CUDA_12_GENERATE
-ARG CUDA_V11_ARCHITECTURES
-ARG CUDA_V12_ARCHITECTURES
 ARG OLLAMA_FAST_BUILD
 RUN --mount=type=cache,target=/root/.ccache \
-    make -j 5
+    make -j 5 dist payload

 # Jetsons need to be built in discrete stages
 FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_5} AS runners-jetpack5-arm64
@ -107,7 +97,7 @@ COPY . .
 ARG CGO_CFLAGS
 ENV GOARCH arm64
 RUN --mount=type=cache,target=/root/.ccache \
-    make -j 5 cuda_v11 \
+    make -j 5 dist_cuda_v11 payload_cuda_v11 \
        CUDA_ARCHITECTURES="72;87" \
        GPU_RUNNER_VARIANT=_jetpack5 \
        CGO_EXTRA_LDFLAGS_LINUX=-L/usr/local/cuda/lib64/stubs \
@ -126,7 +116,7 @@ COPY . .
 ARG CGO_CFLAGS
 ENV GOARCH arm64
 RUN --mount=type=cache,target=/root/.ccache \
-    make -j 5 cuda_v12 \
+    make -j 5 dist_cuda_v12 payload_cuda_v12 \
        CUDA_ARCHITECTURES="87" \
        GPU_RUNNER_VARIANT=_jetpack6 \
        CGO_EXTRA_LDFLAGS_LINUX=-L/usr/local/cuda/lib64/stubs \
--- a/2
+++ b/2
@ -1,4 +1,4 @@
 GOALS := $(or $(MAKECMDGOALS),all)
 .PHONY: $(GOALS)
 $(GOALS):
-	$(MAKE) -C llama $@
+	@$(MAKE) --no-print-directory -C llama $@
--- a/docs/development.md
+++ b/docs/development.md
@ -3,35 +3,24 @@
 Install required tools:

 - go version 1.22 or higher
- gcc version 11.4.0 or higher
+- OS specific C/C++ compiler (see below)
+- GNU Make


+## Overview
+
+Ollama uses a mix of Go and C/C++ code to interface with GPUs.  The C/C++ code is compiled with both CGO and GPU library specific compilers.  A set of GNU Makefiles is used to compile the project.  GPU Libraries are auto-detected based on the typical environment variables used by the respective libraries, but can be overridden if necessary.  The default make target will build the runners and primary Go Ollama application.  Throughout the examples below '-j 5' is suggested for 5 parallel jobs to speed up the build.  You can adjust the job count based on your CPU Core count to optimize build times.  To learn more about the other make targets use 'make help'.
+
+Once you have built the GPU/CPU runners, you can compile the main application with `go build .` 
+
 ### MacOS

 [Download Go](https://go.dev/dl/)

-Optionally enable debugging and more verbose logging:
-
-```bash
-# At build time
-export CGO_CFLAGS="-g"
-
-# At runtime
-export OLLAMA_DEBUG=1
-```
-
-Get the required libraries and build the native LLM code:  (Adjust the job count based on your number of processors for a faster build)
-
 ```bash
 make -j 5
 ```

-Then build ollama:
-
-```bash
-go build .
-```
-
 Now you can run `ollama`:

 ```bash
@ -51,64 +40,38 @@ _Your operating system distribution may already have packages for NVIDIA CUDA. D
 Install `make`, `gcc` and `golang` as well as [NVIDIA CUDA](https://developer.nvidia.com/cuda-downloads)
 development and runtime packages.

-Typically the build scripts will auto-detect CUDA, however, if your Linux distro
-or installation approach uses unusual paths, you can specify the location by
-specifying an environment variable `CUDA_LIB_DIR` to the location of the shared
-libraries, and `CUDACXX` to the location of the nvcc compiler. You can customize
-a set of target CUDA architectures by setting `CMAKE_CUDA_ARCHITECTURES` (e.g. "50;60;70")
-
-Then generate dependencies:  (Adjust the job count based on your number of processors for a faster build)
+Typically the makefile will auto-detect CUDA, however, if your Linux distro
+or installation approach uses alternative paths, you can specify the location by
+overriding `CUDA_PATH` to the location of the CUDA toolkit. You can customize
+a set of target CUDA architectures by setting `CUDA_ARCHITECTURES` (e.g. `CUDA_ARCHITECTURES="50;60;70"`)

 ```
 make -j 5
 ```

-Then build the binary:
-
-```
-go build .
-```
+If both v11 and v12 toolkits are detected, runners for both major versions will be built by default.  You can build just v12 with `make cuda_v12`

 #### Linux ROCm (AMD)

-_Your operating system distribution may already have packages for AMD ROCm and CLBlast. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!_
+_Your operating system distribution may already have packages for AMD ROCm. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!_

-Install [CLBlast](https://github.com/CNugteren/CLBlast/blob/master/doc/installation.md) and [ROCm](https://rocm.docs.amd.com/en/latest/) development packages first, as well as `make`, `gcc`, and `golang`.
+Install [ROCm](https://rocm.docs.amd.com/en/latest/) development packages first, as well as `make`, `gcc`, and `golang`.

 Typically the build scripts will auto-detect ROCm, however, if your Linux distro
 or installation approach uses unusual paths, you can specify the location by
-specifying an environment variable `ROCM_PATH` to the location of the ROCm
-install (typically `/opt/rocm`), and `CLBlast_DIR` to the location of the
-CLBlast install (typically `/usr/lib/cmake/CLBlast`). You can also customize
-the AMD GPU targets by setting AMDGPU_TARGETS (e.g. `AMDGPU_TARGETS="gfx1101;gfx1102"`)
-
-Then generate dependencies:  (Adjust the job count based on your number of processors for a faster build)
+specifying an environment variable `HIP_PATH` to the location of the ROCm
+install (typically `/opt/rocm`). You can also customize
+the AMD GPU targets by setting `HIP_ARCHS` (e.g. `HIP_ARCHS="gfx1101;gfx1102"`)

 ```
 make -j 5
 ```

-Then build the binary:
-
-```
-go build .
-```
-
 ROCm requires elevated privileges to access the GPU at runtime. On most distros you can add your user account to the `render` group, or run as root.

-#### Advanced CPU Settings
-
-By default, running `make` will compile a few different variations
-of the LLM library based on common CPU families and vector math capabilities,
-including a lowest-common-denominator which should run on almost any 64 bit CPU
-somewhat slowly. At runtime, Ollama will auto-detect the optimal variation to
-load. 
-
-Custom CPU settings are not currently supported in the new Go server build but will be added back after we complete the transition.
-
 #### Containerized Linux Build

-If you have Docker available, you can build linux binaries with `./scripts/build_linux.sh` which has the CUDA and ROCm dependencies included. The resulting binary is placed in `./dist`
+If you have Docker and buildx available, you can build linux binaries with `./scripts/build_linux.sh` which has the CUDA and ROCm dependencies included. The resulting artifacts are placed in `./dist`  and by default the script builds both arm64 and amd64 binaries.  If you want to build only amd64, you can build with `PLATFORM=linux/amd64 ./scripts/build_linux.sh`

 ### Windows

@ -126,12 +89,8 @@ The following tools are required as a minimal development environment to build C
 > [!NOTE]  
 > Due to bugs in the GCC C++ library for unicode support, Ollama should be built with clang on windows.

-Then, build the `ollama` binary:
-
-```powershell
-$env:CGO_ENABLED="1"
-make -j 8
-go build .
+```
+make -j 5
 ```

 #### GPU Support
@ -173,3 +132,30 @@ pacman -S mingw-w64-clang-aarch64-clang mingw-w64-clang-aarch64-gcc-compat mingw
 ```

 You will need to ensure your PATH includes go, cmake, gcc and clang mingw32-make to build ollama from source. (typically `C:\msys64\clangarm64\bin\`)
+
+
+## Advanced CPU Vector Settings
+
+On x86, running `make` will compile several CPU runners which can run on different CPU families. At runtime, Ollama will auto-detect the best variation to load.  If GPU libraries are present at build time, Ollama also compiles GPU runners with the `AVX` CPU vector feature enabled.  This provides a good performance balance when loading large models that split across GPU and CPU with broad compatibility.  Some users may prefer no vector extensions (e.g. older Xeon/Celeron processors, or hypervisors that mask the vector features) while other users may prefer turning on many more vector extensions to further improve performance for split model loads.
+
+To customize the set of CPU vector features enabled for a CPU runner and all GPU runners, use CUSTOM_CPU_FLAGS during the build.
+
+To build without any vector flags:
+
+```
+make CUSTOM_CPU_FLAGS=""
+```
+
+To build with both AVX and AVX2:
+```
+make CUSTOM_CPU_FLAGS=avx,avx2
+```
+
+To build with AVX512 features turned on:
+
+```
+make CUSTOM_CPU_FLAGS=avx,avx2,avx512,avx512vbmi,avx512vnni,avx512bf16
+```
+
+> [!NOTE]  
+> If you are experimenting with different flags, make sure to do a `make clean` between each change to ensure everything is rebuilt with the new compiler flags.
--- a/llama/Makefile
+++ b/llama/Makefile
@ -4,53 +4,115 @@ include make/common-defs.make
 RUNNER_TARGETS := default

 # Determine which if any GPU runners we should build
+include make/cuda-v11-defs.make
+include make/cuda-v12-defs.make
+
 ifeq ($(OS),windows)
-	CUDA_PATH?=$(shell cygpath -m -s "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\" 2>/dev/null)unknown
-	CUDA_BASE_DIR := $(dir $(shell cygpath -m -s "$(CUDA_PATH)\\.." 2>/dev/null))
-	CUDA_11:=$(shell ls -d $(CUDA_BASE_DIR)/v11.? 2>/dev/null)
-	CUDA_12:=$(shell ls -d $(CUDA_BASE_DIR)/v12.? 2>/dev/null)
 	HIP_LIB_DIR := $(shell ls -d $(HIP_PATH)/lib 2>/dev/null)
+	HIP_COMPILER:=$(wildcard $(HIP_PATH)/bin/hipcc.bin.exe)
 else ifeq ($(OS),linux)
 	HIP_PATH?=/opt/rocm
 	HIP_LIB_DIR := $(shell ls -d $(HIP_PATH)/lib 2>/dev/null)
-	CUDA_PATH?=/usr/local/cuda
-	CUDA_11:=$(shell ls -d $(CUDA_PATH)-11 2>/dev/null)
-	CUDA_12:=$(shell ls -d $(CUDA_PATH)-12 2>/dev/null)
+	HIP_COMPILER:=$(wildcard $(HIP_PATH)/bin/hipcc)
 endif

+# Without CUSTOM_CPU_FLAGS we default to build both v11 and v12 if present
+ifeq ($(CUSTOM_CPU_FLAGS),)
 ifeq ($(OLLAMA_SKIP_CUDA_GENERATE),)
-ifneq ($(CUDA_11),)
+ifneq ($(CUDA_11_COMPILER),)
 	RUNNER_TARGETS += cuda_v11
 endif
-ifneq ($(CUDA_12),)
+ifneq ($(CUDA_12_COMPILER),)
 	RUNNER_TARGETS += cuda_v12
 endif
 endif
+else # CUSTOM_CPU_FLAGS is set, we'll build only the latest cuda version detected
+ifneq ($(CUDA_12),)
+	RUNNER_TARGETS += cuda_v12
+else ifneq ($(CUDA_11),)
+	RUNNER_TARGETS += cuda_v11
+endif
+endif
+
 ifeq ($(OLLAMA_SKIP_ROCM_GENERATE),)
-ifneq ($(HIP_LIB_DIR),)
+ifneq ($(HIP_COMPILER),)
 	RUNNER_TARGETS += rocm
 endif
 endif


-all: clean-payload .WAIT runners
+all: runners exe
+
+dist: $(addprefix dist_, $(RUNNER_TARGETS)) dist_exe
+
+dist_%:
+	@$(MAKE) --no-print-directory -f make/Makefile.$* dist
+
+payload: $(addprefix payload_, $(RUNNER_TARGETS))
+
+payload_%:
+	@$(MAKE) --no-print-directory -f make/Makefile.$* payload

 runners: $(RUNNER_TARGETS)

 $(RUNNER_TARGETS):
-	$(MAKE) -f make/Makefile.$@
+	@$(MAKE) --no-print-directory -f make/Makefile.$@
+
+exe dist_exe:
+	@$(MAKE) --no-print-directory -f make/Makefile.ollama $@

 help-sync apply-patches create-patches sync:
-	$(MAKE) -f make/Makefile.sync $@
+	@$(MAKE) --no-print-directory -f make/Makefile.sync $@

-clean:
-	rm -rf $(BUILD_DIR) $(DIST_RUNNERS) $(PAYLOAD_RUNNERS)
+test integration lint:
+	@$(MAKE) --no-print-directory -f make/Makefile.test $@
+
+clean: clean-payload
+	rm -rf $(BUILD_DIR) $(DIST_LIB_DIR)
 	go clean -cache

 clean-payload:
 	rm -rf $(addprefix $(RUNNERS_PAYLOAD_DIR)/, $(RUNNER_TARGETS) metal cpu cpu_avx cpu_avx2)

-.PHONY: all runners clean clean-payload $(RUNNER_TARGETS) .WAIT
+help:
+	@echo "The following make targets will help you BUILD Ollama"
+	@echo ""
+	@echo "	make all   		# (default target) Build Ollama llm subprocess runners, and the primary ollama executable"
+	@echo "	make runners		# Build Ollama llm subprocess runners; after you may use 'go build .' to build the primary ollama exectuable"
+	@echo "	make <runner>		# Build specific runners. Enabled: '$(RUNNER_TARGETS)'"
+	@echo "	make payload		# Build the runners as payloads (Linux/Mac only)"
+	@echo "	make dist		# Build the runners for distribution and gather dependencies"
+	@echo "	make help-sync 		# Help information on vendor update targets"
+	@echo "	make help-runners 	# Help information on runner targets"
+	@echo ""
+	@echo "The following make targets will help you TEST Ollama"
+	@echo ""
+	@echo "	make test   		# Run unit tests"
+	@echo "	make integration	# Run integration tests.  You must 'make all' first"
+	@echo "	make lint   		# Run lint and style tests"
+	@echo ""
+	@echo "For more information see 'docs/development.md'"
+	@echo ""
+
+
+help-runners:
+	@echo "The following runners will be built based on discovered GPU libraries: '$(RUNNER_TARGETS)'"
+	@echo "(On MacOS arm64 'default' is the metal runner.  For all other platforms 'default' is one or more CPU runners)"
+	@echo ""
+	@echo "GPU Runner CPU Flags: '$(GPU_RUNNER_CPU_FLAGS)'  (Override with CUSTOM_CPU_FLAGS)"
+	@echo ""
+	@echo "# CUDA_PATH sets the location where CUDA toolkits are present"
+	@echo "CUDA_PATH=$(CUDA_PATH)"
+	@echo "	CUDA_11=$(CUDA_11)"
+	@echo "	CUDA_11_COMPILER=$(CUDA_11_COMPILER)"
+	@echo "	CUDA_12=$(CUDA_12)"
+	@echo "	CUDA_12_COMPILER=$(CUDA_12_COMPILER)"
+	@echo ""
+	@echo "# HIP_PATH sets the location where the ROCm toolkit is present"
+	@echo "HIP_PATH=$(HIP_PATH)"
+	@echo "	HIP_COMPILER=$(HIP_COMPILER)"
+
+.PHONY: all exe dist payload help help-sync help-runners test integration lint runners clean clean-payload $(RUNNER_TARGETS)

 # Handy debugging for make variables
 print-%:
--- a/llama/llama.go
+++ b/llama/llama.go
@ -9,17 +9,19 @@ package llama
 #cgo amd64,avx CXXFLAGS: -mavx
 #cgo amd64,avx2 CFLAGS: -mavx2 -mfma
 #cgo amd64,avx2 CXXFLAGS: -mavx2 -mfma
+#cgo amd64,avx512 CFLAGS: -mavx512f -mavx512dq -mavx512bw
+#cgo amd64,avx512 CXXFLAGS: -mavx512f -mavx512dq -mavx512bw
+#cgo amd64,avx512bf16 CFLAGS: -mavx512bf16 -D__AVX512BF16__
+#cgo amd64,avx512bf16 CXXFLAGS: -mavx512bf16 -D__AVX512BF16__
+#cgo amd64,avx512vbmi CFLAGS: -mavx512vbmi -D__AVX512VBMI__
+#cgo amd64,avx512vbmi CXXFLAGS: -mavx512vbmi -D__AVX512VBMI__
+#cgo amd64,avx512vnni CFLAGS: -mavx512vnni -D__AVX512VNNI__
+#cgo amd64,avx512vnni CXXFLAGS: -mavx512vnni -D__AVX512VNNI__
 #cgo amd64,f16c CFLAGS: -mf16c
 #cgo amd64,f16c CXXFLAGS: -mf16c
 #cgo amd64,fma CFLAGS: -mfma
 #cgo amd64,fma CXXFLAGS: -mfma
-#cgo avx CFLAGS: -mavx
-#cgo avx CXXFLAGS: -mavx
-#cgo avx2 CFLAGS: -mavx2 -mfma -mf16c
-#cgo avx2 CXXFLAGS: -mavx2 -mfma -mf16c
 #cgo cuda CFLAGS: -fPIE -DGGML_USE_CUDA -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
-#cgo cuda CFLAGS: -fPIE -DGGML_USE_CUDA -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
-#cgo cuda CXXFLAGS: -DGGML_USE_CUDA -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
 #cgo cuda CXXFLAGS: -DGGML_USE_CUDA -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
 #cgo cuda_jetpack5 LDFLAGS: -lggml_cuda_jetpack5 -L/usr/local/cuda-11/lib64
 #cgo cuda_jetpack6 LDFLAGS: -lggml_cuda_jetpack6 -L/usr/local/cuda-12/lib64
@ -36,11 +38,10 @@ package llama
 #cgo darwin,arm64 LDFLAGS: -framework Foundation -framework Metal -framework MetalKit -framework Accelerate
 #cgo linux CFLAGS: -D_GNU_SOURCE
 #cgo linux CXXFLAGS: -D_GNU_SOURCE
-#cgo linux,amd64 LDFLAGS: -L${SRCDIR}/build/Linux/amd64
-#cgo linux,amd64 LDFLAGS: -L${SRCDIR}/build/Linux/amd64
+#cgo linux,amd64 LDFLAGS: -L${SRCDIR}/build/linux-amd64
 #cgo linux,arm64 CFLAGS: -D__aarch64__ -D__ARM_NEON -D__ARM_FEATURE_FMA
 #cgo linux,arm64 CXXFLAGS: -D__aarch64__ -D__ARM_NEON -D__ARM_FEATURE_FMA
-#cgo linux,arm64 LDFLAGS: -L${SRCDIR}/build/Linux/arm64
+#cgo linux,arm64 LDFLAGS: -L${SRCDIR}/build/linux-arm64
 #cgo linux,arm64,sve CFLAGS: -march=armv8.6-a+sve
 #cgo linux,arm64,sve CXXFLAGS: -march=armv8.6-a+sve
 #cgo linux,cuda LDFLAGS: -lcuda -lcudart -lcublas -lcublasLt -lpthread -ldl -lrt -lresolv
@ -50,14 +51,11 @@ package llama
 #cgo rocm LDFLAGS: -L${SRCDIR} -lggml_rocm -lhipblas -lamdhip64 -lrocblas
 #cgo windows CFLAGS: -Wno-discarded-qualifiers -D_WIN32_WINNT=0x602
 #cgo windows CXXFLAGS: -D_WIN32_WINNT=0x602
-#cgo windows LDFLAGS: -lmsvcrt
 #cgo windows LDFLAGS: -lmsvcrt -static-libstdc++ -static-libgcc -static
-#cgo windows,amd64 LDFLAGS: -L${SRCDIR}/build/Windows/amd64
-#cgo windows,amd64 LDFLAGS: -L${SRCDIR}/build/Windows/amd64
+#cgo windows,amd64 LDFLAGS: -L${SRCDIR}/build/windows-amd64
 #cgo windows,arm64 CFLAGS: -D__aarch64__ -D__ARM_NEON -D__ARM_FEATURE_FMA
 #cgo windows,arm64 CXXFLAGS: -D__aarch64__ -D__ARM_NEON -D__ARM_FEATURE_FMA
-#cgo windows,arm64 LDFLAGS: -L${SRCDIR}/build/Windows/arm64
-#cgo windows,arm64 LDFLAGS: -L${SRCDIR}/build/Windows/arm64
+#cgo windows,arm64 LDFLAGS: -L${SRCDIR}/build/windows-arm64
 #cgo windows,cuda LDFLAGS: -lcuda -lcudart -lcublas -lcublasLt
 #cgo windows,rocm LDFLAGS: -lggml_rocm -lhipblas -lamdhip64 -lrocblas

--- a/llama/make/Makefile.cuda_v11
+++ b/llama/make/Makefile.cuda_v11
@ -1,7 +1,7 @@
 # Build rules for CUDA v11 runner

 include make/common-defs.make
-
+include make/cuda-v11-defs.make

 GPU_RUNNER_VARIANT := _v11
 GPU_PATH_ROOT_WIN=$(shell ls -d $(dir $(shell cygpath -m -s "$(CUDA_PATH)\.."))/v11.? 2>/dev/null)
--- a/llama/make/Makefile.cuda_v12
+++ b/llama/make/Makefile.cuda_v12
@ -1,7 +1,7 @@
 # Build rules for CUDA v12 runner

 include make/common-defs.make
-
+include make/cuda-v12-defs.make

 GPU_RUNNER_VARIANT := _v12
 GPU_PATH_ROOT_WIN=$(shell ls -d $(dir $(shell cygpath -m -s "$(CUDA_PATH)\.."))/v12.? 2>/dev/null)
--- a/llama/make/Makefile.default
+++ b/llama/make/Makefile.default
@ -8,7 +8,7 @@ CPU_GOFLAGS="-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$(VERS
 DEFAULT_RUNNER := $(if $(and $(filter darwin,$(OS)),$(filter arm64,$(ARCH))),metal,cpu)
 RUNNERS := $(DEFAULT_RUNNER)
 ifeq ($(ARCH),amd64)
-ifeq ($(CUSTOM_CPU_FLAGS),)
+ifeq ($(origin CUSTOM_CPU_FLAGS),undefined)
 	RUNNERS += cpu_avx cpu_avx2
 endif
 endif
@ -19,7 +19,11 @@ PAYLOAD_RUNNERS = $(addprefix $(RUNNERS_PAYLOAD_DIR)/,$(addsuffix /ollama_llama_
 endif
 BUILD_RUNNERS = $(addprefix $(RUNNERS_BUILD_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT),$(RUNNERS)))

-all: $(BUILD_RUNNERS) $(DIST_RUNNERS) $(PAYLOAD_RUNNERS)
+default: $(BUILD_RUNNERS) 
+
+dist: $(DIST_RUNNERS)
+
+payload: $(PAYLOAD_RUNNERS)

 $(RUNNERS_BUILD_DIR)/$(DEFAULT_RUNNER)/ollama_llama_server$(EXE_EXT): TARGET_CPU_FLAGS=$(CUSTOM_CPU_FLAGS)
 $(RUNNERS_BUILD_DIR)/$(DEFAULT_RUNNER)/ollama_llama_server$(EXE_EXT): *.go ./runner/*.go $(COMMON_SRCS) $(COMMON_HDRS)
@ -47,7 +51,7 @@ $(RUNNERS_PAYLOAD_DIR)/%/ollama_llama_server$(EXE_EXT).gz: $(RUNNERS_BUILD_DIR)/
 clean: 
 	rm -f $(BUILD_RUNNERS) $(DIST_RUNNERS) $(PAYLOAD_RUNNERS)

-.PHONY: clean all
+.PHONY: clean default dist

 # Handy debugging for make variables
 print-%:
--- a/llama/make/Makefile.ollama
+++ b/llama/make/Makefile.ollama
@ -0,0 +1,20 @@
+# Makefile for building top-level ollama binary
+
+include make/common-defs.make
+
+exe ollama: $(OLLAMA_EXE)
+dist_exe dist_ollama: $(DIST_OLLAMA_EXE)
+
+GO_DEPS=$(foreach dir,$(shell go list -deps -f '{{.Dir}}' ../ ),$(wildcard $(dir)/*.go))
+CPU_GOFLAGS="-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$(VERSION)\" \"-X=github.com/ollama/ollama/llama.CpuFeatures=$(subst $(space),$(comma),$(TARGET_CPU_FLAGS))\" $(TARGET_LDFLAGS)"
+PAYLOADS=$(wildcard $(RUNNERS_PAYLOAD_DIR)/*/*.gz)
+
+$(OLLAMA_EXE) $(DIST_OLLAMA_EXE): TARGET_CPU_FLAGS=$(CUSTOM_CPU_FLAGS)
+$(OLLAMA_EXE) $(DIST_OLLAMA_EXE): $(COMMON_SRCS) $(COMMON_HDRS) $(PAYLOADS) $(GO_DEPS) 
+	GOARCH=$(ARCH) go build -buildmode=pie $(CPU_GOFLAGS) -trimpath $(if $(CUSTOM_CPU_FLAGS),-tags $(subst $(space),$(comma),$(CUSTOM_CPU_FLAGS)))  -o $@ ../
+
+.PHONY: ollama dist_ollama exe dist_exe
+
+# Handy debugging for make variables
+print-%:
+	@echo '$*=$($*)'
--- a/llama/make/Makefile.rocm
+++ b/llama/make/Makefile.rocm
@ -13,12 +13,12 @@ ifeq ($(OS),windows)
 	CGO_EXTRA_LDFLAGS_WIN := -L$(shell cygpath -m -s "$(HIP_PATH)/lib")
 	GPU_COMPILER_WIN := $(HIP_PATH)/bin/hipcc.bin.exe
 	GPU_COMPILER:=$(GPU_COMPILER_WIN)
+	HIP_ARCHS?=$(HIP_ARCHS_COMMON)
 else ifeq ($(OS),linux)
 	GPU_LIB_DIR_LINUX := $(HIP_PATH)/lib
-	GPU_COMPILER_LINUX := $(shell X=$$(which hipcc 2>/dev/null) && echo $$X)
+	GPU_COMPILER_LINUX := $(wildcard $(HIP_PATH)/bin/hipcc)
 	GPU_COMPILER:=$(GPU_COMPILER_LINUX)
-	ROCM_TRANSITIVE_LIBS_INITIAL = $(sort $(shell ldd $(GPU_LIBS) | grep "=>" | cut -f2 -d= | cut -f2 -d' '  | grep -e rocm -e amdgpu -e libtinfo -e libnuma -e libelf))
-	GPU_TRANSITIVE_LIBS = $(sort $(shell readlink -f $(ROCM_TRANSITIVE_LIBS_INITIAL)) $(ROCM_TRANSITIVE_LIBS_INITIAL))
+	HIP_ARCHS?=$(HIP_ARCHS_COMMON) $(HIP_ARCHS_LINUX)
 endif

 # TODO future multi-variant support for ROCm
@ -38,26 +38,34 @@ GPU_COMPILER_CFLAGS_LINUX = $(CFLAGS) -fPIC -D_GNU_SOURCE
 GPU_COMPILER_CXXFLAGS_WIN = $(CXXFLAGS) -D_WIN32_WINNT=0x602
 GPU_COMPILER_CXXFLAGS_LINUX = $(CXXFLAGS) -fPIC -D_GNU_SOURCE

-GPU_LIBS = $(wildcard $(addsuffix .$(SHARED_EXT),$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT))))
+# Note: ROCm requires an extra step of discovering and copying the transitive dependencies on linux
 ifeq ($(OS),windows)
-	ROCM_DIST_DEPS_DIR = $(abspath $(SRC_DIR)/../dist/$(OS)-$(ARCH))/lib/ollama
+	ROCM_DIST_DEPS_DIR = ../dist/$(OS)-$(ARCH)/lib/ollama
+	GPU_LIBS = $(sort $(wildcard $(addsuffix *.$(SHARED_EXT),$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT)))))
 else ifeq ($(OS),linux)
-	ROCM_DIST_DEPS_DIR = $(abspath $(SRC_DIR)/../dist/$(OS)-$(ARCH)-rocm)/lib/ollama
+	ROCM_DIST_DEPS_DIR = ../dist/$(OS)-$(ARCH)-rocm/lib/ollama
+	GPU_LIBS = $(sort $(wildcard $(addsuffix *.$(SHARED_EXT).*,$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT)))))
+	ROCM_TRANSITIVE_LIBS_INITIAL = $(sort $(shell ldd $(GPU_LIBS) | grep "=>" | cut -f2 -d= | cut -f2 -d' '  | grep -e rocm -e amdgpu -e libtinfo -e libnuma -e libelf))
+	GPU_TRANSITIVE_LIBS = $(sort $(shell readlink -f $(ROCM_TRANSITIVE_LIBS_INITIAL)) $(ROCM_TRANSITIVE_LIBS_INITIAL))
+	FILTERED_GPU_TRANSITIVE_LIBS=$(sort $(filter-out $(addprefix %,$(notdir $(GPU_LIBS))), $(GPU_TRANSITIVE_LIBS)))
+	GPU_DIST_TRANSITIVE_LIB_DEPS = $(sort $(addprefix $(ROCM_DIST_DEPS_DIR)/,$(notdir $(FILTERED_GPU_TRANSITIVE_LIBS))))
 endif
-GPU_DIST_DEPS_LIBS= $(sort $(addprefix $(ROCM_DIST_DEPS_DIR)/,$(notdir $(GPU_LIBS)) $(notdir $(GPU_TRANSITIVE_LIBS))))
+GPU_DIST_LIB_DEPS= $(sort $(addprefix $(ROCM_DIST_DEPS_DIR)/,$(notdir $(GPU_LIBS))))
 ROCBLAS_DIST_DEP_MANIFEST = $(ROCM_DIST_DEPS_DIR)/rocblas/library/TensileManifest.txt

 ifeq ($(OS),linux)
 	GPU_COMPILER_FPIC := -fPIC -Wno-unused-function -std=gnu++11
-	GPU_RUNNER_ARCH_FLAGS := $(foreach arch, $(HIP_ARCHS_COMMON) $(HIP_ARCHS_LINUX), --offload-arch=$(arch))
 else ifeq ($(OS),windows)
 	GPU_COMPILER_FPIC := -Xclang --dependent-lib=msvcrt
-	GPU_RUNNER_ARCH_FLAGS := $(foreach arch, $(HIP_ARCHS_COMMON), --offload-arch=$(arch))
 endif
+GPU_RUNNER_ARCH_FLAGS := $(foreach arch,$(subst ;,$(space),$(HIP_ARCHS)),--offload-arch=$(arch))
+
+# HIPCC uses clang which requires avx512 -> -mavx512f -mavx512dq -mavx512bw
+GPU_VECTOR_FLAGS=$(if $(filter avx512,$(GPU_RUNNER_CPU_FLAGS)),avx512f avx512dq avx512bw) $(filter-out avx512,$(GPU_RUNNER_CPU_FLAGS))

 GPU_COMPILER_CUFLAGS = \
 	$(GPU_COMPILER_FPIC) \
-	$(addprefix -m,$(GPU_RUNNER_CPU_FLAGS)) \
+	$(addprefix -m,$(GPU_VECTOR_FLAGS)) \
 	-mf16c \
 	-mfma \
 	-parallel-jobs=2 \
@ -101,9 +109,14 @@ endif
 include make/gpu.make

 # Adjust the rules from gpu.make to handle the ROCm dependencies properly
-$(RUNNERS_DIST_DIR)/$(GPU_RUNNER_NAME)/ollama_llama_server$(EXE_EXT): $(ROCBLAS_DIST_DEP_MANIFEST)
+$(RUNNERS_DIST_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/ollama_llama_server$(EXE_EXT): $(ROCBLAS_DIST_DEP_MANIFEST) $(GPU_DIST_TRANSITIVE_LIB_DEPS)
 $(ROCBLAS_DIST_DEP_MANIFEST):
 	@-mkdir -p $(dir $@)
 	@echo "Copying rocblas library..."
-	cd $(GPU_LIB_DIR)/rocblas/library/ && tar cf - . | (cd $(dir $@) && tar xf - )
+	(cd $(GPU_LIB_DIR)/rocblas/library/ && tar cf - . ) | (cd $(dir $@) && tar xf - )
 	@echo "rocblas library copy complete"
+
+$(GPU_DIST_TRANSITIVE_LIB_DEPS):
+	@-mkdir -p $(dir $@)
+	echo "transitive dist deps libs rule cp: $@"
+	$(CP) $(dir $(filter %$(notdir $@),$(GPU_TRANSITIVE_LIBS)))/$(notdir $@) $(dir $@)
--- a/llama/make/Makefile.sync
+++ b/llama/make/Makefile.sync
@ -1,23 +1,24 @@
 # Helpers for managing our vendored llama.cpp repo and patch set

-REPO_ROOT:=$(dir $(patsubst %/,%,$(dir $(patsubst %/,%,$(dir $(abspath $(lastword $(MAKEFILE_LIST))))))))
-DST_DIR:=$(dir $(patsubst %/,%,$(dir $(abspath $(lastword $(MAKEFILE_LIST))))))
+REPO_ROOT:=../../
+DEST_DIR:=./

-include $(REPO_ROOT)llama/vendoring
+include $(DEST_DIR)vendoring

-LLAMACPP_REPO := $(REPO_ROOT)llama/vendor/
+LLAMACPP_REPO := ./vendor/

-LLAMACPP_PATCH_DIR := $(DST_DIR)patches/
+# Relative to the vendor dir
+VENDOR_RELATIVE_PATCH_DIR := ../patches/


 help-sync:
 	@echo "The following make targets will help you update llama.cpp to a new base commit, or work on new features/fixes"
 	@echo ""
-	@echo "\tmake apply-patches   # Establish the tracking repo if not already present, reset to the base commit, and apply our patch set"
-	@echo "\tmake sync            # Vendor llama.cpp and ggml from the tracking repo working tree"
-	@echo "\tmake create-patches  # Generate the patch set based on the current commits in the tracking repo since the base commit"
+	@echo "	make apply-patches   # Establish the tracking repo if not already present, reset to the base commit, and apply our patch set"
+	@echo "	make sync            # Vendor llama.cpp and ggml from the tracking repo working tree"
+	@echo "	make create-patches  # Generate the patch set based on the current commits in the tracking repo since the base commit"
 	@echo ""
-	@echo "For more details on the workflow, see the Vendoring section in ../docs/development.md"
+	@echo "For more details on the workflow, see the Vendoring section in 'docs/development.md'"

 apply-patches: $(LLAMACPP_REPO)
 	@if ! git -C $(LLAMACPP_REPO) --no-pager diff --exit-code ; then \
@ -29,7 +30,7 @@ apply-patches: $(LLAMACPP_REPO)
 	@git -C $(LLAMACPP_REPO) checkout -q $(LLAMACPP_BASE_COMMIT) || \
 		git -C $(LLAMACPP_REPO) fetch --all && git -C $(LLAMACPP_REPO) checkout -q $(LLAMACPP_BASE_COMMIT)
 	@echo "Applying ollama patches..."
-	@git -c 'user.name=nobody' -c 'user.email=<>' -C $(LLAMACPP_REPO) am -3 $(LLAMACPP_PATCH_DIR)/*.patch || \
+	@cd $(LLAMACPP_REPO) && git -c 'user.name=nobody' -c 'user.email=<>' am -3 $(VENDOR_RELATIVE_PATCH_DIR)*.patch || \
 		echo "Please resolve the conflicts in $(LLAMACPP_REPO), and run 'git am --continue' to continue applying subsequent patches"
 	@echo ""
 	@echo "The tracking repo $(LLAMACPP_REPO) is now in a detached state with all patches applied."
@ -44,7 +45,7 @@ create-patches: $(LLAMACPP_REPO)
  		echo "ERROR: Your llama.cpp repo is dirty.  You must commit any pending changes for format-patch to generate patches"; \
  		exit 1; \
 	fi
-	git -C $(LLAMACPP_REPO) format-patch --no-signature --no-numbered --zero-commit -o $(LLAMACPP_PATCH_DIR) $(LLAMACPP_BASE_COMMIT)
+	@cd $(LLAMACPP_REPO) && git format-patch --no-signature --no-numbered --zero-commit -o $(VENDOR_RELATIVE_PATCH_DIR) $(LLAMACPP_BASE_COMMIT)

 # Vendoring template logic
 EXCLUDED_FILES=sgemm.cpp sgemm.h sampling_ext.cpp sampling_ext.h stb_image.h json.hpp llama_darwin.c base64.hpp
@ -86,12 +87,12 @@ LLAMACPP_FILES=\
 	include/llama.h \
 	ggml/src/llamafile/sgemm.cpp \
 	ggml/src/llamafile/sgemm.h
-$(foreach name,$(LLAMACPP_FILES),$(eval $(call vendor_file,$(name),$(DST_DIR))))
+$(foreach name,$(LLAMACPP_FILES),$(eval $(call vendor_file,$(name),$(DEST_DIR))))

 # llama.cpp files -> llama/llamafile
 LLAMAFILE_FILES= \
 	ggml/src/llamafile/sgemm.h
-$(foreach name,$(LLAMAFILE_FILES),$(eval $(call vendor_file,$(name),$(DST_DIR)llamafile/)))
+$(foreach name,$(LLAMAFILE_FILES),$(eval $(call vendor_file,$(name),$(DEST_DIR)llamafile/)))

 # ggml files -> llama/
 GGML_FILES= \
@ -115,10 +116,10 @@ GGML_FILES= \
 	ggml/src/ggml-cpu-impl.h \
 	ggml/include/ggml-blas.h \
 	ggml/src/ggml-blas.cpp
-$(foreach name,$(GGML_FILES),$(eval $(call vendor_file,$(name),$(DST_DIR))))
+$(foreach name,$(GGML_FILES),$(eval $(call vendor_file,$(name),$(DEST_DIR))))

 # TODO generalize renaming pattern if we have more of these
-$(DST_DIR)ggml-metal_darwin_arm64.m : $(LLAMACPP_REPO)ggml/src/ggml-metal.m
+$(DEST_DIR)ggml-metal_darwin_arm64.m : $(LLAMACPP_REPO)ggml/src/ggml-metal.m
 	@echo "vendoring $(subst $(LLAMACPP_REPO),,$<)"; \
 		mkdir -p $(dir $@) && \
 		echo "/**" > $@ && \
@ -128,20 +129,20 @@ $(DST_DIR)ggml-metal_darwin_arm64.m : $(LLAMACPP_REPO)ggml/src/ggml-metal.m
 		echo " */" >> $@ && \
 		echo "" >> $@ && \
 		cat $< >> $@
-VENDORED_FILES += $(DST_DIR)ggml-metal_darwin_arm64.m
+VENDORED_FILES += $(DEST_DIR)ggml-metal_darwin_arm64.m

 # ggml-cuda -> llama/ggml-cuda/
 GGML_CUDA_FILES= ggml/src/ggml-cuda/*.cu ggml/src/ggml-cuda/*.cuh
 GGML_CUDA_FILES_EXPANDED = $(addprefix ggml/src/ggml-cuda/,$(notdir $(wildcard $(addprefix $(LLAMACPP_REPO),$(GGML_CUDA_FILES)))))
-$(foreach name,$(GGML_CUDA_FILES_EXPANDED),$(eval $(call vendor_file,$(name),$(DST_DIR)ggml-cuda/)))
+$(foreach name,$(GGML_CUDA_FILES_EXPANDED),$(eval $(call vendor_file,$(name),$(DEST_DIR)ggml-cuda/)))

 GGML_TEMPLATE_FILES= ggml/src/ggml-cuda/template-instances/*.cu
 GGML_TEMPLATE_FILES_EXPANDED = 	$(addprefix ggml/src/ggml-cuda/template-instances/,$(notdir $(wildcard $(addprefix $(LLAMACPP_REPO),$(GGML_TEMPLATE_FILES)))))
-$(foreach name,$(GGML_TEMPLATE_FILES_EXPANDED),$(eval $(call vendor_file,$(name),$(DST_DIR)ggml-cuda/template-instances/)))
+$(foreach name,$(GGML_TEMPLATE_FILES_EXPANDED),$(eval $(call vendor_file,$(name),$(DEST_DIR)ggml-cuda/template-instances/)))

 GGML_VENDOR_FILES= ggml/src/ggml-cuda/vendors/*.h
 GGML_VENDOR_FILES_EXPANDED=$(addprefix ggml/src/ggml-cuda/vendors/,$(notdir $(wildcard $(addprefix $(LLAMACPP_REPO),$(GGML_VENDOR_FILES)))))
-$(foreach name,$(GGML_VENDOR_FILES_EXPANDED),$(eval $(call vendor_file,$(name),$(DST_DIR)ggml-cuda/vendors/)))
+$(foreach name,$(GGML_VENDOR_FILES_EXPANDED),$(eval $(call vendor_file,$(name),$(DEST_DIR)ggml-cuda/vendors/)))

 # llava -> llama/
 LAVA_FILES= \
@ -163,23 +164,23 @@ LAVA_FILES+= \
 	common/json-schema-to-grammar.cpp \
 	common/json-schema-to-grammar.h \
 	common/base64.hpp
-$(foreach name,$(LAVA_FILES),$(eval $(call vendor_file,$(name),$(DST_DIR))))
+$(foreach name,$(LAVA_FILES),$(eval $(call vendor_file,$(name),$(DEST_DIR))))

-$(DST_DIR)build-info.cpp:
+$(DEST_DIR)build-info.cpp:
 	@echo "Generating $@"
 	@echo "int LLAMA_BUILD_NUMBER = 0;" > $@
 	@echo "char const *LLAMA_COMMIT = \"$(LLAMACPP_BASE_COMMIT)\";" >> $@
 	@echo "char const *LLAMA_COMPILER = \"\";" >> $@
 	@echo "char const *LLAMA_BUILD_TARGET = \"\";" >> $@
-VENDORED_FILES += $(DST_DIR)build-info.cpp
+VENDORED_FILES += $(DEST_DIR)build-info.cpp


 sync: $(LLAMACPP_REPO) .WAIT $(VENDORED_FILES) .WAIT remove-stale-files

 PATS=*.c *.h *.cpp *.m *.metal *.cu *.cuh
-NATIVE_DIRS=$(DST_DIR) $(DST_DIR)llamafile/ $(DST_DIR)ggml-cuda/ $(DST_DIR)ggml-cuda/template-instances/ $(DST_DIR)ggml-cuda/vendors/
+NATIVE_DIRS=$(DEST_DIR) $(DEST_DIR)llamafile/ $(DEST_DIR)ggml-cuda/ $(DEST_DIR)ggml-cuda/template-instances/ $(DEST_DIR)ggml-cuda/vendors/
 ALL_NATIVE_FILES=$(foreach dir,$(NATIVE_DIRS),$(wildcard $(addprefix $(dir),$(PATS))))
-EXTRA_NATIVE_FILES=$(filter-out $(VENDORED_FILES) $(addprefix $(DST_DIR),$(OLLAMA_NATIVE_FILES)), $(ALL_NATIVE_FILES))
+EXTRA_NATIVE_FILES=$(filter-out $(VENDORED_FILES) $(addprefix $(DEST_DIR),$(OLLAMA_NATIVE_FILES)), $(ALL_NATIVE_FILES))
 remove-stale-files:
 	@rm -f $(EXTRA_NATIVE_FILES)

--- a/llama/make/Makefile.test
+++ b/llama/make/Makefile.test
@ -0,0 +1,19 @@
+# Targets to assist in running tests: unit tests (test), integration tests (integration), and lint (lint)
+
+include make/common-defs.make
+
+test:
+	cd .. && go test ./... 
+
+integration: $(OLLAMA_EXE)
+	cd .. && go test --tags=integration ./integration -v
+
+lint:
+	cd .. && golangci-lint run -v
+
+# Note: in this makefile we error out instead of building $(OLLAMA_EXE), to allow more fine-grained control of testing flows
+$(OLLAMA_EXE):
+	@echo ""
+	@echo "ERROR: You must build ollama first - use 'make all' to build the ollama binaries"
+	@echo ""
+	@exit 1
--- a/llama/make/common-defs.make
+++ b/llama/make/common-defs.make
@ -21,18 +21,28 @@ export CGO_CXXFLAGS_ALLOW = -mfma|-mf16c
 export HIP_PLATFORM = amd
 export CGO_ENABLED=1

-SRC_DIR := $(dir $(patsubst %/,%,$(dir $(abspath $(lastword $(MAKEFILE_LIST))))))
-BUILD_DIR = $(SRC_DIR)build/$(OS)-$(ARCH)
-DIST_BASE = $(abspath $(SRC_DIR)/../dist/$(OS)-$(ARCH))
+ifneq ($(OS),windows)
+CCACHE:=$(shell command -v ccache 2>/dev/null || echo "")
+# Relative paths used to avoid tripping over spaces
+# working directory must be <repo>/llama/
+BUILD_DIR = ./build/$(OS)-$(ARCH)
+DIST_BASE = ../dist/$(OS)-$(ARCH)
+RUNNERS_PAYLOAD_DIR = ../build/$(OS)/$(ARCH)
+DIST_OLLAMA_EXE=$(DIST_BASE)/bin/ollama$(EXE_EXT)
+else
+# Absolute paths, converted with cygpath to 8.3 short names so they contain no spaces
+PWD="$(shell pwd)"
+CYGPWD=$(shell cygpath -m -s "$(PWD)")
+BUILD_DIR = $(CYGPWD)/build/$(OS)-$(ARCH)
+DIST_BASE = $(CYGPWD)/../dist/$(OS)-$(ARCH)
+RUNNERS_PAYLOAD_DIR = $(CYGPWD)/../build/$(OS)/$(ARCH)
+DIST_OLLAMA_EXE=$(DIST_BASE)/ollama$(EXE_EXT)
+endif
 DIST_LIB_DIR = $(DIST_BASE)/lib/ollama
 RUNNERS_DIST_DIR = $(DIST_LIB_DIR)/runners
-RUNNERS_PAYLOAD_DIR = $(abspath $(SRC_DIR)/../build/$(OS)/$(ARCH))
 RUNNERS_BUILD_DIR = $(BUILD_DIR)/runners
 DEFAULT_RUNNER := $(if $(and $(filter darwin,$(OS)),$(filter arm64,$(ARCH))),metal,cpu)
 GZIP:=$(shell command -v pigz 2>/dev/null || echo "gzip")
-ifneq ($(OS),windows)
-	CCACHE:=$(shell command -v ccache 2>/dev/null || echo "")
-endif
 VERSION?=$(shell git describe --tags --first-parent --abbrev=7 --long --dirty --always | sed -e "s/^v//g")

 # Conditionally enable ccache for cgo builds too
@ -44,14 +54,18 @@ ifneq ($(CCACHE),)
 endif


-# Override in environment space separated to tune GPU runner CPU vector flags
+# Override in environment to tune CPU vector flags
 ifeq ($(ARCH),amd64)
-	GPU_RUNNER_CPU_FLAGS ?= avx
+ifeq ($(origin CUSTOM_CPU_FLAGS),undefined)
+	GPU_RUNNER_CPU_FLAGS=avx
+	GPU_RUNNER_EXTRA_VARIANT=_avx
+else
+	GPU_RUNNER_CPU_FLAGS=$(subst $(comma),$(space),$(CUSTOM_CPU_FLAGS))
+endif
 endif

 ifeq ($(OS),windows)
 	CP := cp
-	SRC_DIR := $(shell cygpath -m -s "$(SRC_DIR)")
 	OBJ_EXT := obj
 	SHARED_EXT := dll
 	EXE_EXT := .exe
@ -63,7 +77,7 @@ ifneq ($(HIP_PATH),)
 	export HIP_PATH
 endif
 else ifeq ($(OS),linux)
-	CP := cp -af
+	CP := cp -df
 	OBJ_EXT := o
 	SHARED_EXT := so
 	SHARED_PREFIX := lib
@ -73,7 +87,7 @@ else
 	OBJ_EXT := o
 	SHARED_EXT := so
 	CPU_FLAG_PREFIX := -m
-	CP := cp -af
+	CP := cp -df
 endif

 COMMON_SRCS := \
@ -82,3 +96,5 @@ COMMON_SRCS := \
 COMMON_HDRS := \
 	$(wildcard *.h) \
 	$(wildcard *.hpp)
+
+OLLAMA_EXE=../ollama$(EXE_EXT)
--- a/llama/make/cuda-v11-defs.make
+++ b/llama/make/cuda-v11-defs.make
@ -0,0 +1,13 @@
+# Common definitions for Makefiles that need CUDA v11: locates the v11 toolkit (CUDA_11) and its nvcc (CUDA_11_COMPILER, empty if not found)
+# No rules are defined here so this is safe to include at the beginning of other makefiles
+
+ifeq ($(OS),windows)
+	CUDA_PATH?=$(shell cygpath -m -s "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\" 2>/dev/null)unknown
+	CUDA_BASE_DIR := $(dir $(shell cygpath -m -s "$(CUDA_PATH)\\.." 2>/dev/null))
+	CUDA_11:=$(shell ls -d $(CUDA_BASE_DIR)/v11.? 2>/dev/null)
+	CUDA_11_COMPILER:=$(wildcard $(CUDA_11)/bin/nvcc.exe)
+else ifeq ($(OS),linux)
+	CUDA_PATH?=/usr/local/cuda
+	CUDA_11:=$(shell ls -d $(CUDA_PATH)-11 2>/dev/null)
+	CUDA_11_COMPILER:=$(wildcard $(CUDA_11)/bin/nvcc)
+endif
--- a/llama/make/cuda-v12-defs.make
+++ b/llama/make/cuda-v12-defs.make
@ -0,0 +1,13 @@
+# Common definitions for Makefiles that need CUDA v12: locates the v12 toolkit (CUDA_12) and its nvcc (CUDA_12_COMPILER, empty if not found)
+# No rules are defined here so this is safe to include at the beginning of other makefiles
+
+ifeq ($(OS),windows)
+	CUDA_PATH?=$(shell cygpath -m -s "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\" 2>/dev/null)unknown
+	CUDA_BASE_DIR := $(dir $(shell cygpath -m -s "$(CUDA_PATH)\\.." 2>/dev/null))
+	CUDA_12:=$(shell ls -d $(CUDA_BASE_DIR)/v12.? 2>/dev/null)
+	CUDA_12_COMPILER:=$(wildcard $(CUDA_12)/bin/nvcc.exe)
+else ifeq ($(OS),linux)
+	CUDA_PATH?=/usr/local/cuda
+	CUDA_12:=$(shell ls -d $(CUDA_PATH)-12 2>/dev/null)
+	CUDA_12_COMPILER:=$(wildcard $(CUDA_12)/bin/nvcc)
+endif
--- a/llama/make/cuda.make
+++ b/llama/make/cuda.make
@ -11,26 +11,38 @@ GPU_RUNNER_GO_TAGS := cuda cuda$(GPU_RUNNER_VARIANT)
 GPU_RUNNER_DRIVER_LIB_LINK := -lcuda
 GPU_RUNNER_LIBS_SHORT := cublas cudart cublasLt
 GPU_LIB_DIR_WIN = $(GPU_PATH_ROOT_WIN)/bin
-GPU_LIB_DIR_LINUX = $(GPU_PATH_ROOT_LINUX)/lib64
+ifneq ($(GPU_PATH_ROOT_LINUX),)
+GPU_LIB_DIR_LINUX=$(strip $(shell ls -d $(GPU_PATH_ROOT_LINUX)/lib64 2>/dev/null || ls -d $(GPU_PATH_ROOT_LINUX)/lib 2>/dev/null))
+GPU_COMPILER_LINUX = $(GPU_PATH_ROOT_LINUX)/bin/nvcc
+endif
 CGO_EXTRA_LDFLAGS_WIN = -L"$(GPU_PATH_ROOT_WIN)/lib/x64"
 GPU_COMPILER_WIN = $(GPU_PATH_ROOT_WIN)/bin/nvcc
-GPU_COMPILER_LINUX = $(GPU_PATH_ROOT_LINUX)/bin/nvcc
 GPU_COMPILER_CFLAGS_WIN = $(CFLAGS) -D_WIN32_WINNT=0x602
 GPU_COMPILER_CFLAGS_LINUX = $(CFLAGS) -Xcompiler -fPIC -D_GNU_SOURCE
 GPU_COMPILER_CXXFLAGS_WIN = $(CXXFLAGS) -D_WIN32_WINNT=0x602
 GPU_COMPILER_CXXFLAGS_LINUX = $(CXXFLAGS) -Xcompiler -fPIC -D_GNU_SOURCE
-GPU_LIBS = $(sort $(wildcard $(addsuffix *.$(SHARED_EXT)*,$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT)))))
-GPU_DIST_DEPS_LIBS= $(sort $(addprefix $(DIST_GPU_RUNNER_DEPS_DIR)/,$(notdir $(GPU_LIBS))))

-ifeq ($(OS),linux)
+ifeq ($(OS),windows)
+	# On windows, nvcc uses msvc which does not support avx512vbmi avx512vnni avx512bf16, but macros can turn them on
+	GPU_VECTOR_FLAGS=$(call uc,$(filter-out avx512bf16,$(filter-out avx512vnni,$(filter-out avx512vbmi,$(GPU_RUNNER_CPU_FLAGS)))))
+	GPU_COMPILER_EXTRA_FLAGS=$(if $(filter avx512vbmi,$(GPU_RUNNER_CPU_FLAGS)),-D__AVX512VBMI__)
+	GPU_COMPILER_EXTRA_FLAGS+=$(if $(filter avx512vnni,$(GPU_RUNNER_CPU_FLAGS)),-D__AVX512VNNI__)
+	GPU_COMPILER_EXTRA_FLAGS+=$(if $(filter avx512bf16,$(GPU_RUNNER_CPU_FLAGS)),-D__AVX512BF16__)
+	GPU_LIBS = $(sort $(wildcard $(addsuffix *.$(SHARED_EXT),$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT)))))
+else ifeq ($(OS),linux)
+	# On linux, nvcc requires avx512 -> -mavx512f -mavx512dq -mavx512bw
+	GPU_VECTOR_FLAGS=$(if $(filter avx512,$(GPU_RUNNER_CPU_FLAGS)),avx512f avx512dq avx512bw) $(filter-out avx512,$(GPU_RUNNER_CPU_FLAGS))
 	CUDA_PATH?=/usr/local/cuda
-	GPU_COMPILER_FPIC = -fPIC -Wno-unused-function -std=c++11
+	GPU_COMPILER_EXTRA_FLAGS = -fPIC -Wno-unused-function -std=c++11
+	GPU_LIBS = $(sort $(wildcard $(addsuffix *.$(SHARED_EXT).*,$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT)))))
 endif
+GPU_DIST_LIB_DEPS= $(sort $(addprefix $(DIST_GPU_RUNNER_DEPS_DIR)/,$(notdir $(GPU_LIBS))))
+
 GPU_RUNNER_ARCH_FLAGS := $(foreach arch,$(subst ;,$(space),$(CUDA_ARCHITECTURES)),--generate-code=arch=compute_$(arch)$(comma)code=[compute_$(arch)$(comma)sm_$(arch)]) \
 	-DGGML_CUDA_USE_GRAPHS=1
 GPU_COMPILER_CUFLAGS = \
-	$(GPU_COMPILER_FPIC) \
-	-Xcompiler "$(addprefix $(CPU_FLAG_PREFIX),$(_OS_GPU_RUNNER_CPU_FLAGS))" \
+	$(GPU_COMPILER_EXTRA_FLAGS) \
+	-Xcompiler "$(addprefix $(CPU_FLAG_PREFIX),$(GPU_VECTOR_FLAGS))" \
 	-t2 \
 	-DGGML_CUDA_DMMV_X=32 \
 	-DGGML_CUDA_MMV_Y=1 \
--- a/llama/make/gpu.make
+++ b/llama/make/gpu.make
@ -25,14 +25,8 @@ GPU_GOFLAGS="-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$(VERS
 # today, cuda is bundled, but rocm is split out.  Should split them each out by runner
 DIST_GPU_RUNNER_DEPS_DIR = $(DIST_LIB_DIR)

-ifeq ($(OS),windows)
-	_OS_GPU_RUNNER_CPU_FLAGS=$(call uc,$(GPU_RUNNER_CPU_FLAGS))
-else ifeq ($(OS),linux)
-	_OS_GPU_RUNNER_CPU_FLAGS=$(GPU_RUNNER_CPU_FLAGS)
-endif

 GPU_RUNNER_LIBS = $(wildcard $(addsuffix .$(SHARED_EXT).*,$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT))))
-DIST_GPU_RUNNER_LIB_DEPS = $(addprefix $(DIST_GPU_RUNNER_DEPS_DIR)/,$(notdir $(GPU_RUNNER_LIBS)))

 GPU_RUNNER_SRCS := \
 	ggml-cuda.cu \
@ -60,14 +54,18 @@ GPU_RUNNER_OBJS := $(GPU_RUNNER_SRCS:.cu=.$(GPU_RUNNER_NAME).$(OBJ_EXT))
 GPU_RUNNER_OBJS := $(GPU_RUNNER_OBJS:.c=.$(GPU_RUNNER_NAME).$(OBJ_EXT))
 GPU_RUNNER_OBJS := $(addprefix $(BUILD_DIR)/,$(GPU_RUNNER_OBJS:.cpp=.$(GPU_RUNNER_NAME).$(OBJ_EXT)))

-DIST_RUNNERS = $(addprefix $(RUNNERS_DIST_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT),$(GPU_RUNNER_NAME)))
+DIST_RUNNERS = $(addprefix $(RUNNERS_DIST_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT),$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)))
 ifneq ($(OS),windows)
-PAYLOAD_RUNNERS = $(addprefix $(RUNNERS_PAYLOAD_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT).gz,$(GPU_RUNNER_NAME)))
+PAYLOAD_RUNNERS = $(addprefix $(RUNNERS_PAYLOAD_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT).gz,$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)))
 endif
-BUILD_RUNNERS = $(addprefix $(RUNNERS_BUILD_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT),$(GPU_RUNNER_NAME)))
+BUILD_RUNNERS = $(addprefix $(RUNNERS_BUILD_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT),$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)))


-$(GPU_RUNNER_NAME): $(BUILD_RUNNERS) $(DIST_RUNNERS) $(PAYLOAD_RUNNERS)
+$(GPU_RUNNER_NAME): $(BUILD_RUNNERS) 
+
+dist: $(DIST_RUNNERS)
+
+payload: $(PAYLOAD_RUNNERS)

 # Build targets
 $(BUILD_DIR)/%.$(GPU_RUNNER_NAME).$(OBJ_EXT): %.cu
@ -79,11 +77,11 @@ $(BUILD_DIR)/%.$(GPU_RUNNER_NAME).$(OBJ_EXT): %.c
 $(BUILD_DIR)/%.$(GPU_RUNNER_NAME).$(OBJ_EXT): %.cpp
 	@-mkdir -p $(dir $@)
 	$(CCACHE) $(GPU_COMPILER) -c $(GPU_COMPILER_CXXFLAGS) -o $@ $<
-$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/ollama_llama_server$(EXE_EXT): TARGET_CGO_LDFLAGS = -L"$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/" $(CGO_EXTRA_LDFLAGS)
-$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/ollama_llama_server$(EXE_EXT): $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT) *.go ./runner/*.go $(COMMON_SRCS) $(COMMON_HDRS)
+$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/ollama_llama_server$(EXE_EXT): TARGET_CGO_LDFLAGS = -L"$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/" $(CGO_EXTRA_LDFLAGS)
+$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/ollama_llama_server$(EXE_EXT): $(BUILD_DIR)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT) *.go ./runner/*.go $(COMMON_SRCS) $(COMMON_HDRS)
 	@-mkdir -p $(dir $@)
-	GOARCH=$(ARCH) CGO_LDFLAGS="$(TARGET_CGO_LDFLAGS)" go build -buildmode=pie  $(GPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(GPU_RUNNER_CPU_FLAGS) $(GPU_RUNNER_GO_TAGS)) -o $@ ./runner
-$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT): $(GPU_RUNNER_OBJS) $(DIST_GPU_RUNNER_LIB_DEPS) $(COMMON_HDRS) $(GPU_RUNNER_HDRS)
+	GOARCH=$(ARCH) CGO_LDFLAGS="$(TARGET_CGO_LDFLAGS)" go build -buildmode=pie $(GPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(GPU_RUNNER_CPU_FLAGS) $(GPU_RUNNER_GO_TAGS)) -o $@ ./runner
+$(BUILD_DIR)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT): $(GPU_RUNNER_OBJS) $(COMMON_HDRS) $(GPU_RUNNER_HDRS)
 	@-mkdir -p $(dir $@)
 	$(CCACHE) $(GPU_COMPILER) --shared -L$(GPU_LIB_DIR) $(GPU_RUNNER_DRIVER_LIB_LINK) -L${DIST_GPU_RUNNER_DEPS_DIR} $(foreach lib, $(GPU_RUNNER_LIBS_SHORT), -l$(lib)) $(GPU_RUNNER_OBJS) -o $@

@ -91,22 +89,19 @@ $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).
 $(RUNNERS_DIST_DIR)/%: $(RUNNERS_BUILD_DIR)/%
 	@-mkdir -p $(dir $@)
 	$(CP) $< $@
-$(RUNNERS_DIST_DIR)/$(GPU_RUNNER_NAME)/ollama_llama_server$(EXE_EXT): $(DIST_LIB_DIR)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT) $(GPU_DIST_DEPS_LIBS)
-$(DIST_LIB_DIR)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT): $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT)
+$(RUNNERS_DIST_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/ollama_llama_server$(EXE_EXT): $(DIST_LIB_DIR)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT) $(GPU_DIST_LIB_DEPS)
+$(DIST_LIB_DIR)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT): $(BUILD_DIR)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT)
 	@-mkdir -p $(dir $@)
 	$(CP) $< $@
-$(DIST_GPU_RUNNER_LIB_DEPS): 
+$(GPU_DIST_LIB_DEPS):
 	@-mkdir -p $(dir $@)
 	$(CP) $(GPU_LIB_DIR)/$(notdir $@) $(dir $@)
-$(GPU_DIST_DEPS_LIBS): 
-	@-mkdir -p $(dir $@)
-	$(CP) $(dir $(filter %$(notdir $@),$(GPU_LIBS) $(GPU_TRANSITIVE_LIBS)))/$(notdir $@) $(dir $@)

 # Payload targets
 $(RUNNERS_PAYLOAD_DIR)/%/ollama_llama_server.gz: $(RUNNERS_BUILD_DIR)/%/ollama_llama_server 
 	@-mkdir -p $(dir $@)
 	${GZIP} --best -c $< > $@
-$(RUNNERS_PAYLOAD_DIR)/$(GPU_RUNNER_NAME)/%.gz: $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/%
+$(RUNNERS_PAYLOAD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/%.gz: $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/%
 	@-mkdir -p $(dir $@)
 	${GZIP} --best -c $< > $@

--- a/llama/runner/requirements.go
+++ b/llama/runner/requirements.go
@ -1,19 +0,0 @@
-package main
-
-import (
-	"encoding/json"
-	"os"
-
-	"github.com/ollama/ollama/llama"
-	"github.com/ollama/ollama/version"
-)
-
-func printRequirements(fp *os.File) {
-	attrs := map[string]string{
-		"system_info":  llama.PrintSystemInfo(),
-		"version":      version.Version,
-		"cpu_features": llama.CpuFeatures,
-	}
-	enc := json.NewEncoder(fp)
-	_ = enc.Encode(attrs)
-}
--- a/llama/runner/runner.go
+++ b/llama/runner/runner.go
@ -818,13 +818,8 @@ func main() {
 	mlock := flag.Bool("mlock", false, "force system to keep model in RAM rather than swapping or compressing")
 	tensorSplit := flag.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions")
 	multiUserCache := flag.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users")
-	requirements := flag.Bool("requirements", false, "print json requirement information")

 	flag.Parse()
-	if *requirements {
-		printRequirements(os.Stdout)
-		return
-	}
 	level := slog.LevelInfo
 	if *verbose {
 		level = slog.LevelDebug
--- a/runners/common.go
+++ b/runners/common.go
@ -17,6 +17,7 @@ import (
 	"syscall"

 	"golang.org/x/sync/errgroup"
+	"golang.org/x/sys/cpu"

 	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/envconfig"
@ -51,14 +52,18 @@ func Refresh(payloadFS fs.FS) (string, error) {
 		}()
 	}

-	if hasPayloads(payloadFS) {
-		if runnersDir == "" {
-			runnersDir, err = extractRunners(payloadFS)
-		} else {
-			err = refreshRunners(payloadFS, runnersDir)
+	// avoid payloads if we're operating off a local build
+	d, err := locateRunners()
+	if err != nil {
+		if hasPayloads(payloadFS) {
+			if runnersDir == "" {
+				runnersDir, err = extractRunners(payloadFS)
+			} else {
+				err = refreshRunners(payloadFS, runnersDir)
+			}
 		}
-	} else if runnersDir == "" {
-		runnersDir, err = locateRunners()
+	} else {
+		runnersDir = d
 	}

 	return runnersDir, err
@ -78,31 +83,23 @@ func Cleanup(payloadFS fs.FS) {
 	}
 }

+// locateRunners searches for runners in a prioritized set of locations
+// 1. local build, with executable at the top of the tree
+// 2. lib directory relative to executable
+// 3. payload extracted to OLLAMA_TMPDIR (this routine returns an error)
 func locateRunners() (string, error) {
 	exe, err := os.Executable()
 	if err != nil {
 		return "", err
 	}

-	cwd, err := os.Getwd()
-	if err != nil {
-		return "", err
+	paths := []string{
+		filepath.Join(filepath.Dir(exe), "llama", "build", runtime.GOOS+"-"+runtime.GOARCH, "runners"),
+		filepath.Join(filepath.Dir(exe), envconfig.LibRelativeToExe(), "lib", "ollama", "runners"),
 	}
-
-	var paths []string
-	for _, root := range []string{filepath.Dir(exe), filepath.Join(filepath.Dir(exe), envconfig.LibRelativeToExe()), cwd} {
-		paths = append(paths,
-			root,
-			filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH),
-			filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH),
-		)
-	}
-
-	// Try a few variations to improve developer experience when building from source in the local tree
 	for _, path := range paths {
-		candidate := filepath.Join(path, "lib", "ollama", "runners")
-		if _, err := os.Stat(candidate); err == nil {
-			return candidate, nil
+		if _, err := os.Stat(path); err == nil {
+			return path, nil
 		}
 	}
 	return "", fmt.Errorf("unable to locate runners in any search path %v", paths)
@ -292,7 +289,16 @@ func GetAvailableServers(payloadsDir string) map[string]string {
 	servers := make(map[string]string)
 	for _, file := range files {
 		slog.Debug("availableServers : found", "file", file)
-		servers[filepath.Base(filepath.Dir(file))] = filepath.Dir(file)
+		runnerName := filepath.Base(filepath.Dir(file))
+		// Special case for our GPU runners: if the runner was compiled with the
+		// standard AVX flag ("_avx" suffix), skip it when the host CPU lacks AVX.
+		// Custom builds omit this suffix, and it's up to the user to ensure compatibility.
+		parsed := strings.Split(runnerName, "_")
+		if len(parsed) == 3 && parsed[2] == "avx" && !cpu.X86.HasAVX {
+			slog.Info("GPU runner incompatible with host system, CPU does not have AVX", "runner", runnerName)
+			continue
+		}
+		servers[runnerName] = filepath.Dir(file)
 	}

 	return servers
--- a/scripts/build_windows.ps1
+++ b/scripts/build_windows.ps1
@ -82,7 +82,7 @@ function buildOllama() {
    if ($null -eq ${env:OLLAMA_SKIP_GENERATE}) {
        write-host "Building ollama runners"
        Remove-Item -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}"
-        & make -C llama -j 12
+        & make -C llama -j 12 dist
        if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
    } else {
        write-host "Skipping generate step with OLLAMA_SKIP_GENERATE set"