diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
index a0f5e4d4..37327a6a 100644
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -85,7 +85,7 @@ jobs:
           import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
           Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo'
           if (!(gcc --version | select-string -quiet clang)) { throw "wrong gcc compiler detected - must be clang" }
-          make
+          make dist
         name: make
       - uses: actions/upload-artifact@v4
         with:
@@ -143,8 +143,8 @@ jobs:
           import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
           Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo'
           if (!(gcc --version | select-string -quiet clang)) { throw "wrong gcc compiler detected - must be clang" }
-          make -C llama print-HIP_PATH print-HIP_LIB_DIR
-          make rocm
+          make help-runners
+          make dist_rocm
       - uses: actions/upload-artifact@v4
         with:
           name: generate-windows-rocm
@@ -226,7 +226,7 @@ jobs:
           import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
           Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo'
           if (!(gcc --version | select-string -quiet clang)) { throw "wrong gcc compiler detected - must be clang" }
-          make cuda_v$(($env:CUDA_PATH | split-path -leaf) -replace 'v(\d+).*', '$1')
+          make dist_cuda_v$(($env:CUDA_PATH | split-path -leaf) -replace 'v(\d+).*', '$1')
       - uses: actions/upload-artifact@v4
         with:
           name: generate-windows-cuda-${{ matrix.cuda.version }}
diff --git a/Dockerfile b/Dockerfile
index ca09325c..9750c74e 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,9 +1,7 @@
 ARG GOLANG_VERSION=1.22.8
 ARG CMAKE_VERSION=3.22.1
 ARG CUDA_VERSION_11=11.3.1
-ARG CUDA_V11_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86"
 ARG CUDA_VERSION_12=12.4.0
-ARG CUDA_V12_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a"
 ARG ROCM_VERSION=6.1.2
 ARG JETPACK_6=r36.2.0
 ARG JETPACK_5=r35.4.1
@@ -62,7 +60,7 @@ RUN yum-config-manager --add-repo https://developer.download.nvidia.com/compute/
 ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH:/usr/local/cuda/bin
 ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/cuda/lib64
 ENV LIBRARY_PATH=/usr/local/cuda/lib64/stubs:/opt/amdgpu/lib64
-ENV GOARCH amd64
+ENV GOARCH arm64
 ENV CGO_ENABLED 1
 WORKDIR /go/src/github.com/ollama/ollama/
 ENTRYPOINT [ "zsh" ]
@@ -70,29 +68,21 @@ ENTRYPOINT [ "zsh" ]
 FROM --platform=linux/amd64 unified-builder-amd64 AS runners-amd64
 COPY . .
 ARG OLLAMA_SKIP_CUDA_GENERATE
-ARG OLLAMA_SKIP_CUDA_11_GENERATE
-ARG OLLAMA_SKIP_CUDA_12_GENERATE
 ARG OLLAMA_SKIP_ROCM_GENERATE
-ARG CUDA_V11_ARCHITECTURES
-ARG CUDA_V12_ARCHITECTURES
 ARG OLLAMA_FAST_BUILD
 RUN --mount=type=cache,target=/root/.ccache \
     if grep "^flags" /proc/cpuinfo|grep avx>/dev/null; then \
-        make -j $(expr $(nproc) / 2 ) ; \
+        make -j $(expr $(nproc) / 2 ) dist payload ; \
     else \
-        make -j 5 ; \
+        make -j 5 dist payload ; \
     fi
 
 FROM --platform=linux/arm64 unified-builder-arm64 AS runners-arm64
 COPY . .
 ARG OLLAMA_SKIP_CUDA_GENERATE
-ARG OLLAMA_SKIP_CUDA_11_GENERATE
-ARG OLLAMA_SKIP_CUDA_12_GENERATE
-ARG CUDA_V11_ARCHITECTURES
-ARG CUDA_V12_ARCHITECTURES
 ARG OLLAMA_FAST_BUILD
 RUN --mount=type=cache,target=/root/.ccache \
-    make -j 5
+    make -j 5 dist payload
 
 # Jetsons need to be built in discrete stages
 FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_5} AS runners-jetpack5-arm64
@@ -107,7 +97,7 @@ COPY . .
 ARG CGO_CFLAGS
 ENV GOARCH arm64
 RUN --mount=type=cache,target=/root/.ccache \
-    make -j 5 cuda_v11 \
+    make -j 5 dist_cuda_v11 payload_cuda_v11 \
         CUDA_ARCHITECTURES="72;87" \
         GPU_RUNNER_VARIANT=_jetpack5 \
         CGO_EXTRA_LDFLAGS_LINUX=-L/usr/local/cuda/lib64/stubs \
@@ -126,7 +116,7 @@ COPY . .
 ARG CGO_CFLAGS
 ENV GOARCH arm64
 RUN --mount=type=cache,target=/root/.ccache \
-    make -j 5 cuda_v12 \
+    make -j 5 dist_cuda_v12 payload_cuda_v12 \
         CUDA_ARCHITECTURES="87" \
         GPU_RUNNER_VARIANT=_jetpack6 \
         CGO_EXTRA_LDFLAGS_LINUX=-L/usr/local/cuda/lib64/stubs \
diff --git a/docs/development.md b/docs/development.md
index 3520dc79..202c867e 100644
--- a/docs/development.md
+++ b/docs/development.md
@@ -3,7 +3,7 @@
 Install required tools:
 
 - go version 1.22 or higher
-- OS specific native compiler (see below)
+- OS specific C/C++ compiler (see below)
 - GNU Make
 
 
@@ -69,16 +69,6 @@ make -j 5
 
 ROCm requires elevated privileges to access the GPU at runtime. On most distros you can add your user account to the `render` group, or run as root.
 
-#### Advanced CPU Settings
-
-By default, running `make` will compile a few different variations
-of the LLM library based on common CPU families and vector math capabilities,
-including a lowest-common-denominator which should run on almost any 64 bit CPU
-somewhat slowly. At runtime, Ollama will auto-detect the optimal variation to
-load. 
-
-Custom CPU settings are not currently supported in the new Go server build but will be added back after we complete the transition.
-
 #### Containerized Linux Build
 
 If you have Docker and buildx available, you can build linux binaries with `./scripts/build_linux.sh` which has the CUDA and ROCm dependencies included. The resulting artifacts are placed in `./dist`  and by default the script builds both arm64 and amd64 binaries.  If you want to build only amd64, you can build with `PLATFORM=linux/amd64 ./scripts/build_linux.sh`
@@ -142,3 +132,30 @@ pacman -S mingw-w64-clang-aarch64-clang mingw-w64-clang-aarch64-gcc-compat mingw
 ```
 
 You will need to ensure your PATH includes go, cmake, gcc and clang mingw32-make to build ollama from source. (typically `C:\msys64\clangarm64\bin\`)
+
+
+## Advanced CPU Vector Settings
+
+On x86, running `make` will compile several CPU runners which can run on different CPU families. At runtime, Ollama will auto-detect the best variation to load. If GPU libraries are present at build time, Ollama also compiles GPU runners with the `AVX` CPU vector feature enabled. This provides a good balance of performance and broad compatibility when loading large models that are split across GPU and CPU. Some users may prefer no vector extensions (e.g. for older Xeon/Celeron processors, or hypervisors that mask the vector features), while other users may prefer turning on many more vector extensions to further improve performance for split model loads.
+
+To customize the set of CPU vector features enabled for a CPU runner and all GPU runners, set `CUSTOM_CPU_FLAGS` to a comma-separated list of features during the build.
+
+To build without any vector flags:
+
+```
+make CUSTOM_CPU_FLAGS=""
+```
+
+To build with both AVX and AVX2:
+```
+make CUSTOM_CPU_FLAGS=avx,avx2
+```
+
+To build with AVX512 features turned on:
+
+```
+make CUSTOM_CPU_FLAGS=avx,avx2,avx512,avx512vbmi,avx512vnni,avx512bf16
+```
+
+> [!NOTE]  
+> If you are experimenting with different flags, make sure to run `make clean` between each change to ensure everything is rebuilt with the new compiler flags.
diff --git a/llama/Makefile b/llama/Makefile
index 43fe069c..6b04312c 100644
--- a/llama/Makefile
+++ b/llama/Makefile
@@ -4,26 +4,20 @@ include make/common-defs.make
 RUNNER_TARGETS := default
 
 # Determine which if any GPU runners we should build
+include make/cuda-v11-defs.make
+include make/cuda-v12-defs.make
+
 ifeq ($(OS),windows)
-	CUDA_PATH?=$(shell cygpath -m -s "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\" 2>/dev/null)unknown
-	CUDA_BASE_DIR := $(dir $(shell cygpath -m -s "$(CUDA_PATH)\\.." 2>/dev/null))
-	CUDA_11:=$(shell ls -d $(CUDA_BASE_DIR)/v11.? 2>/dev/null)
-	CUDA_11_COMPILER:=$(wildcard $(CUDA_11)/bin/nvcc.exe)
-	CUDA_12:=$(shell ls -d $(CUDA_BASE_DIR)/v12.? 2>/dev/null)
-	CUDA_12_COMPILER:=$(wildcard $(CUDA_12)/bin/nvcc.exe)
 	HIP_LIB_DIR := $(shell ls -d $(HIP_PATH)/lib 2>/dev/null)
 	HIP_COMPILER:=$(wildcard $(HIP_PATH)/bin/hipcc.bin.exe)
 else ifeq ($(OS),linux)
-	CUDA_PATH?=/usr/local/cuda
-	CUDA_11:=$(shell ls -d $(CUDA_PATH)-11 2>/dev/null)
-	CUDA_11_COMPILER:=$(wildcard $(CUDA_11)/bin/nvcc)
-	CUDA_12:=$(shell ls -d $(CUDA_PATH)-12 2>/dev/null)
-	CUDA_12_COMPILER:=$(wildcard $(CUDA_11)/bin/nvcc)
 	HIP_PATH?=/opt/rocm
 	HIP_LIB_DIR := $(shell ls -d $(HIP_PATH)/lib 2>/dev/null)
 	HIP_COMPILER:=$(wildcard $(HIP_PATH)/bin/hipcc)
 endif
 
+# Without CUSTOM_CPU_FLAGS we default to building both the CUDA v11 and v12 runners if present
+ifeq ($(CUSTOM_CPU_FLAGS),)
 ifeq ($(OLLAMA_SKIP_CUDA_GENERATE),)
 ifneq ($(CUDA_11_COMPILER),)
 	RUNNER_TARGETS += cuda_v11
@@ -32,6 +26,14 @@ ifneq ($(CUDA_12_COMPILER),)
 	RUNNER_TARGETS += cuda_v12
 endif
 endif
+else # CUSTOM_CPU_FLAGS is set, we'll build only the latest cuda version detected
+ifneq ($(CUDA_12),)
+	RUNNER_TARGETS += cuda_v12
+else ifneq ($(CUDA_11),)
+	RUNNER_TARGETS += cuda_v11
+endif
+endif
+
 ifeq ($(OLLAMA_SKIP_ROCM_GENERATE),)
 ifneq ($(HIP_COMPILER),)
 	RUNNER_TARGETS += rocm
@@ -46,10 +48,10 @@ dist: $(addprefix dist_, $(RUNNER_TARGETS))
 dist_%:
 	@$(MAKE) --no-print-directory -f make/Makefile.$* dist
 
-payload: clean-payload .WAIT $(addprefix payload_, $(RUNNER_TARGETS))
+payload: $(addprefix payload_, $(RUNNER_TARGETS))
 
 payload_%:
-	@$(MAKE) --no-print-directory f make/Makefile.$* dist
+	@$(MAKE) --no-print-directory -f make/Makefile.$* payload
 
 runners: $(RUNNER_TARGETS)
 
@@ -65,8 +67,8 @@ help-sync apply-patches create-patches sync:
 test integration lint:
 	@$(MAKE) --no-print-directory -f make/Makefile.test $@
 
-clean:
-	rm -rf $(BUILD_DIR) $(DIST_RUNNERS) $(PAYLOAD_RUNNERS)
+clean: clean-payload
+	rm -rf "$(BUILD_DIR)" "$(DIST_LIB_DIR)"
 	go clean -cache
 
 clean-payload:
@@ -75,9 +77,9 @@ clean-payload:
 help:
 	@echo "The following make targets will help you BUILD Ollama"
 	@echo ""
-	@echo "	make all   		# (default) Build Ollama llm subprocess runners, and the primary ollama executable"
+	@echo "	make all   		# (default target) Build Ollama llm subprocess runners, and the primary ollama executable"
 	@echo "	make runners		# Build Ollama llm subprocess runners; after you may use 'go build .' to build the primary ollama exectuable"
-	@echo "	make <runner>		# Build specific runners. Enabled: $(RUNNER_TARGETS)"
+	@echo "	make <runner>		# Build specific runners. Enabled: '$(RUNNER_TARGETS)'"
 	@echo "	make payload		# Build the runners as payloads (Linux/Mac only)"
 	@echo "	make dist		# Build the runners for distribution and gather dependencies"
 	@echo "	make help-sync 		# Help information on vendor update targets"
@@ -94,9 +96,11 @@ help:
 
 
 help-runners:
-	@echo "The following runners will be built based on discovered GPU libraries: $(RUNNER_TARGETS)"
+	@echo "The following runners will be built based on discovered GPU libraries: '$(RUNNER_TARGETS)'"
 	@echo "(On MacOS arm64 'default' is the metal runner.  For all other platforms 'default' is one or more CPU runners)"
 	@echo ""
+	@echo "GPU Runner CPU Flags: '$(GPU_RUNNER_CPU_FLAGS)'  (Override with CUSTOM_CPU_FLAGS)"
+	@echo ""
 	@echo "# CUDA_PATH sets the location where CUDA toolkits are present"
 	@echo "CUDA_PATH=$(CUDA_PATH)"
 	@echo "	CUDA_11=$(CUDA_11)"
@@ -108,7 +112,7 @@ help-runners:
 	@echo "HIP_PATH=$(HIP_PATH)"
 	@echo "	HIP_COMPILER=$(HIP_COMPILER)"
 
-.PHONY: all exe dist payload help help-sync help-runners test integration lint runners clean clean-payload $(RUNNER_TARGETS) $(addprefix dist_, $(RUNNER_TARGETS)) $(addprefix payload_, $(RUNNER_TARGETS)) .WAIT
+.PHONY: all exe dist payload help help-sync help-runners test integration lint runners clean clean-payload $(RUNNER_TARGETS)
 
 # Handy debugging for make variables
 print-%:
diff --git a/llama/llama.go b/llama/llama.go
index dbb02768..7861b69b 100644
--- a/llama/llama.go
+++ b/llama/llama.go
@@ -9,17 +9,19 @@ package llama
 #cgo amd64,avx CXXFLAGS: -mavx
 #cgo amd64,avx2 CFLAGS: -mavx2 -mfma
 #cgo amd64,avx2 CXXFLAGS: -mavx2 -mfma
+#cgo amd64,avx512 CFLAGS: -mavx512f -mavx512dq -mavx512bw
+#cgo amd64,avx512 CXXFLAGS: -mavx512f -mavx512dq -mavx512bw
+#cgo amd64,avx512bf16 CFLAGS: -mavx512bf16 -D__AVX512BF16__
+#cgo amd64,avx512bf16 CXXFLAGS: -mavx512bf16 -D__AVX512BF16__
+#cgo amd64,avx512vbmi CFLAGS: -mavx512vbmi -D__AVX512VBMI__
+#cgo amd64,avx512vbmi CXXFLAGS: -mavx512vbmi -D__AVX512VBMI__
+#cgo amd64,avx512vnni CFLAGS: -mavx512vnni -D__AVX512VNNI__
+#cgo amd64,avx512vnni CXXFLAGS: -mavx512vnni -D__AVX512VNNI__
 #cgo amd64,f16c CFLAGS: -mf16c
 #cgo amd64,f16c CXXFLAGS: -mf16c
 #cgo amd64,fma CFLAGS: -mfma
 #cgo amd64,fma CXXFLAGS: -mfma
-#cgo avx CFLAGS: -mavx
-#cgo avx CXXFLAGS: -mavx
-#cgo avx2 CFLAGS: -mavx2 -mfma -mf16c
-#cgo avx2 CXXFLAGS: -mavx2 -mfma -mf16c
 #cgo cuda CFLAGS: -fPIE -DGGML_USE_CUDA -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
-#cgo cuda CFLAGS: -fPIE -DGGML_USE_CUDA -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
-#cgo cuda CXXFLAGS: -DGGML_USE_CUDA -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
 #cgo cuda CXXFLAGS: -DGGML_USE_CUDA -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
 #cgo cuda_jetpack5 LDFLAGS: -lggml_cuda_jetpack5 -L/usr/local/cuda-11/lib64
 #cgo cuda_jetpack6 LDFLAGS: -lggml_cuda_jetpack6 -L/usr/local/cuda-12/lib64
@@ -37,7 +39,6 @@ package llama
 #cgo linux CFLAGS: -D_GNU_SOURCE
 #cgo linux CXXFLAGS: -D_GNU_SOURCE
 #cgo linux,amd64 LDFLAGS: -L${SRCDIR}/build/Linux/amd64
-#cgo linux,amd64 LDFLAGS: -L${SRCDIR}/build/Linux/amd64
 #cgo linux,arm64 CFLAGS: -D__aarch64__ -D__ARM_NEON -D__ARM_FEATURE_FMA
 #cgo linux,arm64 CXXFLAGS: -D__aarch64__ -D__ARM_NEON -D__ARM_FEATURE_FMA
 #cgo linux,arm64 LDFLAGS: -L${SRCDIR}/build/Linux/arm64
@@ -50,14 +51,11 @@ package llama
 #cgo rocm LDFLAGS: -L${SRCDIR} -lggml_rocm -lhipblas -lamdhip64 -lrocblas
 #cgo windows CFLAGS: -Wno-discarded-qualifiers -D_WIN32_WINNT=0x602
 #cgo windows CXXFLAGS: -D_WIN32_WINNT=0x602
-#cgo windows LDFLAGS: -lmsvcrt
 #cgo windows LDFLAGS: -lmsvcrt -static-libstdc++ -static-libgcc -static
 #cgo windows,amd64 LDFLAGS: -L${SRCDIR}/build/Windows/amd64
-#cgo windows,amd64 LDFLAGS: -L${SRCDIR}/build/Windows/amd64
 #cgo windows,arm64 CFLAGS: -D__aarch64__ -D__ARM_NEON -D__ARM_FEATURE_FMA
 #cgo windows,arm64 CXXFLAGS: -D__aarch64__ -D__ARM_NEON -D__ARM_FEATURE_FMA
 #cgo windows,arm64 LDFLAGS: -L${SRCDIR}/build/Windows/arm64
-#cgo windows,arm64 LDFLAGS: -L${SRCDIR}/build/Windows/arm64
 #cgo windows,cuda LDFLAGS: -lcuda -lcudart -lcublas -lcublasLt
 #cgo windows,rocm LDFLAGS: -lggml_rocm -lhipblas -lamdhip64 -lrocblas
 
diff --git a/llama/make/Makefile.cuda_v11 b/llama/make/Makefile.cuda_v11
index 528e0efe..ab5e72e4 100644
--- a/llama/make/Makefile.cuda_v11
+++ b/llama/make/Makefile.cuda_v11
@@ -1,7 +1,7 @@
 # Build rules for CUDA v11 runner
 
 include make/common-defs.make
-
+include make/cuda-v11-defs.make
 
 GPU_RUNNER_VARIANT := _v11
 GPU_PATH_ROOT_WIN=$(shell ls -d $(dir $(shell cygpath -m -s "$(CUDA_PATH)\.."))/v11.? 2>/dev/null)
diff --git a/llama/make/Makefile.cuda_v12 b/llama/make/Makefile.cuda_v12
index 2418ef00..63fd48a8 100644
--- a/llama/make/Makefile.cuda_v12
+++ b/llama/make/Makefile.cuda_v12
@@ -1,7 +1,7 @@
 # Build rules for CUDA v12 runner
 
 include make/common-defs.make
-
+include make/cuda-v12-defs.make
 
 GPU_RUNNER_VARIANT := _v12
 GPU_PATH_ROOT_WIN=$(shell ls -d $(dir $(shell cygpath -m -s "$(CUDA_PATH)\.."))/v12.? 2>/dev/null)
diff --git a/llama/make/Makefile.default b/llama/make/Makefile.default
index 16011b1b..53026bfc 100644
--- a/llama/make/Makefile.default
+++ b/llama/make/Makefile.default
@@ -8,7 +8,7 @@ CPU_GOFLAGS="-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$(VERS
 DEFAULT_RUNNER := $(if $(and $(filter darwin,$(OS)),$(filter arm64,$(ARCH))),metal,cpu)
 RUNNERS := $(DEFAULT_RUNNER)
 ifeq ($(ARCH),amd64)
-ifeq ($(CUSTOM_CPU_FLAGS),)
+ifeq ($(origin CUSTOM_CPU_FLAGS),undefined)
 	RUNNERS += cpu_avx cpu_avx2
 endif
 endif
@@ -51,7 +51,7 @@ $(RUNNERS_PAYLOAD_DIR)/%/ollama_llama_server$(EXE_EXT).gz: $(RUNNERS_BUILD_DIR)/
 clean: 
 	rm -f $(BUILD_RUNNERS) $(DIST_RUNNERS) $(PAYLOAD_RUNNERS)
 
-.PHONY: clean default
+.PHONY: clean default dist
 
 # Handy debugging for make variables
 print-%:
diff --git a/llama/make/Makefile.rocm b/llama/make/Makefile.rocm
index 8697fe43..79b5e7c5 100644
--- a/llama/make/Makefile.rocm
+++ b/llama/make/Makefile.rocm
@@ -56,9 +56,12 @@ else ifeq ($(OS),windows)
 endif
 GPU_RUNNER_ARCH_FLAGS := $(foreach arch,$(subst ;,$(space),$(HIP_ARCHS)),--offload-arch=$(arch))
 
+# hipcc uses clang, which requires the generic avx512 flag to be expanded to -mavx512f -mavx512dq -mavx512bw
+GPU_VECTOR_FLAGS=$(if $(filter avx512,$(GPU_RUNNER_CPU_FLAGS)),avx512f avx512dq avx512bw) $(filter-out avx512,$(GPU_RUNNER_CPU_FLAGS))
+
 GPU_COMPILER_CUFLAGS = \
 	$(GPU_COMPILER_FPIC) \
-	$(addprefix -m,$(GPU_RUNNER_CPU_FLAGS)) \
+	$(addprefix -m,$(GPU_VECTOR_FLAGS)) \
 	-mf16c \
 	-mfma \
 	-parallel-jobs=2 \
@@ -102,7 +105,7 @@ endif
 include make/gpu.make
 
 # Adjust the rules from gpu.make to handle the ROCm dependencies properly
-$(RUNNERS_DIST_DIR)/$(GPU_RUNNER_NAME)/ollama_llama_server$(EXE_EXT): $(ROCBLAS_DIST_DEP_MANIFEST)
+$(RUNNERS_DIST_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/ollama_llama_server$(EXE_EXT): $(ROCBLAS_DIST_DEP_MANIFEST)
 $(ROCBLAS_DIST_DEP_MANIFEST):
 	@-mkdir -p $(dir $@)
 	@echo "Copying rocblas library..."
diff --git a/llama/make/Makefile.test b/llama/make/Makefile.test
index e6423720..9b70e934 100644
--- a/llama/make/Makefile.test
+++ b/llama/make/Makefile.test
@@ -11,7 +11,7 @@ integration: $(OLLAMA_EXE)
 lint:
 	cd $(abspath $(SRC_DIR)/..) && golangci-lint run -v
 
-# Note: in this makefile we error instead of building to allow more fine-grain control if testing flows
+# Note: in this makefile we error instead of building to allow more fine-grain control of testing flows
 $(OLLAMA_EXE):
 	@echo ""
 	@echo "ERROR: You must build ollama first - use 'make all' to build the ollama binaries"
diff --git a/llama/make/common-defs.make b/llama/make/common-defs.make
index 006c6001..77f8bca3 100644
--- a/llama/make/common-defs.make
+++ b/llama/make/common-defs.make
@@ -44,9 +44,14 @@ ifneq ($(CCACHE),)
 endif
 
 
-# Override in environment space separated to tune GPU runner CPU vector flags
+# Set CUSTOM_CPU_FLAGS (comma separated) to override the CPU vector flags used for GPU runners
 ifeq ($(ARCH),amd64)
-	GPU_RUNNER_CPU_FLAGS ?= avx
+ifeq ($(origin CUSTOM_CPU_FLAGS),undefined)
+	GPU_RUNNER_CPU_FLAGS=avx
+	GPU_RUNNER_EXTRA_VARIANT=_avx
+else
+	GPU_RUNNER_CPU_FLAGS=$(subst $(comma),$(space),$(CUSTOM_CPU_FLAGS))
+endif
 endif
 
 ifeq ($(OS),windows)
diff --git a/llama/make/cuda-v11-defs.make b/llama/make/cuda-v11-defs.make
new file mode 100644
index 00000000..71a21edd
--- /dev/null
+++ b/llama/make/cuda-v11-defs.make
@@ -0,0 +1,13 @@
+# Common definitions for the various Makefiles which need the CUDA v11 toolkit settings
+# No rules are defined here so this is safe to include at the beginning of other makefiles
+
+ifeq ($(OS),windows)
+	CUDA_PATH?=$(shell cygpath -m -s "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\" 2>/dev/null)unknown
+	CUDA_BASE_DIR := $(dir $(shell cygpath -m -s "$(CUDA_PATH)\\.." 2>/dev/null))
+	CUDA_11:=$(shell ls -d $(CUDA_BASE_DIR)/v11.? 2>/dev/null)
+	CUDA_11_COMPILER:=$(wildcard $(CUDA_11)/bin/nvcc.exe)
+else ifeq ($(OS),linux)
+	CUDA_PATH?=/usr/local/cuda
+	CUDA_11:=$(shell ls -d $(CUDA_PATH)-11 2>/dev/null)
+	CUDA_11_COMPILER:=$(wildcard $(CUDA_11)/bin/nvcc)
+endif
diff --git a/llama/make/cuda-v12-defs.make b/llama/make/cuda-v12-defs.make
new file mode 100644
index 00000000..45fd47a7
--- /dev/null
+++ b/llama/make/cuda-v12-defs.make
@@ -0,0 +1,13 @@
+# Common definitions for the various Makefiles which need the CUDA v12 toolkit settings
+# No rules are defined here so this is safe to include at the beginning of other makefiles
+
+ifeq ($(OS),windows)
+	CUDA_PATH?=$(shell cygpath -m -s "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\" 2>/dev/null)unknown
+	CUDA_BASE_DIR := $(dir $(shell cygpath -m -s "$(CUDA_PATH)\\.." 2>/dev/null))
+	CUDA_12:=$(shell ls -d $(CUDA_BASE_DIR)/v12.? 2>/dev/null)
+	CUDA_12_COMPILER:=$(wildcard $(CUDA_12)/bin/nvcc.exe)
+else ifeq ($(OS),linux)
+	CUDA_PATH?=/usr/local/cuda
+	CUDA_12:=$(shell ls -d $(CUDA_PATH)-12 2>/dev/null)
+	CUDA_12_COMPILER:=$(wildcard $(CUDA_12)/bin/nvcc)
+endif
diff --git a/llama/make/cuda.make b/llama/make/cuda.make
index 7a4b1036..aa5eb060 100644
--- a/llama/make/cuda.make
+++ b/llama/make/cuda.make
@@ -11,26 +11,38 @@ GPU_RUNNER_GO_TAGS := cuda cuda$(GPU_RUNNER_VARIANT)
 GPU_RUNNER_DRIVER_LIB_LINK := -lcuda
 GPU_RUNNER_LIBS_SHORT := cublas cudart cublasLt
 GPU_LIB_DIR_WIN = $(GPU_PATH_ROOT_WIN)/bin
-GPU_LIB_DIR_LINUX = $(GPU_PATH_ROOT_LINUX)/lib64
+ifneq ($(GPU_PATH_ROOT_LINUX),)
+GPU_LIB_DIR_LINUX=$(strip $(shell ls -d $(GPU_PATH_ROOT_LINUX)/lib64 2>/dev/null || ls -d $(GPU_PATH_ROOT_LINUX)/lib 2>/dev/null))
+GPU_COMPILER_LINUX = $(GPU_PATH_ROOT_LINUX)/bin/nvcc
+endif
 CGO_EXTRA_LDFLAGS_WIN = -L"$(GPU_PATH_ROOT_WIN)/lib/x64"
 GPU_COMPILER_WIN = $(GPU_PATH_ROOT_WIN)/bin/nvcc
-GPU_COMPILER_LINUX = $(GPU_PATH_ROOT_LINUX)/bin/nvcc
 GPU_COMPILER_CFLAGS_WIN = $(CFLAGS) -D_WIN32_WINNT=0x602
 GPU_COMPILER_CFLAGS_LINUX = $(CFLAGS) -Xcompiler -fPIC -D_GNU_SOURCE
 GPU_COMPILER_CXXFLAGS_WIN = $(CXXFLAGS) -D_WIN32_WINNT=0x602
 GPU_COMPILER_CXXFLAGS_LINUX = $(CXXFLAGS) -Xcompiler -fPIC -D_GNU_SOURCE
-GPU_LIBS = $(sort $(wildcard $(addsuffix *.$(SHARED_EXT)*,$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT)))))
+
+ifeq ($(OS),windows)
+	# On windows, nvcc uses msvc, which does not support avx512vbmi, avx512vnni or avx512bf16 as flags, but macros can turn them on
+	GPU_VECTOR_FLAGS=$(call uc,$(filter-out avx512bf16,$(filter-out avx512vnni,$(filter-out avx512vbmi,$(GPU_RUNNER_CPU_FLAGS)))))
+	GPU_COMPILER_EXTRA_FLAGS=$(if $(filter avx512vbmi,$(GPU_RUNNER_CPU_FLAGS)),-D__AVX512VBMI__)
+	GPU_COMPILER_EXTRA_FLAGS+=$(if $(filter avx512vnni,$(GPU_RUNNER_CPU_FLAGS)),-D__AVX512VNNI__)
+	GPU_COMPILER_EXTRA_FLAGS+=$(if $(filter avx512bf16,$(GPU_RUNNER_CPU_FLAGS)),-D__AVX512BF16__)
+	GPU_LIBS = $(sort $(wildcard $(addsuffix *.$(SHARED_EXT),$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT)))))
+else ifeq ($(OS),linux)
+	# On linux, nvcc requires the generic avx512 flag to be expanded to -mavx512f -mavx512dq -mavx512bw
+	GPU_VECTOR_FLAGS=$(if $(filter avx512,$(GPU_RUNNER_CPU_FLAGS)),avx512f avx512dq avx512bw) $(filter-out avx512,$(GPU_RUNNER_CPU_FLAGS))
+	CUDA_PATH?=/usr/local/cuda
+	GPU_COMPILER_EXTRA_FLAGS = -fPIC -Wno-unused-function -std=c++11
+	GPU_LIBS = $(sort $(wildcard $(addsuffix *.$(SHARED_EXT).*,$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT)))))
+endif
 GPU_DIST_DEPS_LIBS= $(sort $(addprefix $(DIST_GPU_RUNNER_DEPS_DIR)/,$(notdir $(GPU_LIBS))))
 
-ifeq ($(OS),linux)
-	CUDA_PATH?=/usr/local/cuda
-	GPU_COMPILER_FPIC = -fPIC -Wno-unused-function -std=c++11
-endif
 GPU_RUNNER_ARCH_FLAGS := $(foreach arch,$(subst ;,$(space),$(CUDA_ARCHITECTURES)),--generate-code=arch=compute_$(arch)$(comma)code=[compute_$(arch)$(comma)sm_$(arch)]) \
 	-DGGML_CUDA_USE_GRAPHS=1
 GPU_COMPILER_CUFLAGS = \
-	$(GPU_COMPILER_FPIC) \
-	-Xcompiler "$(addprefix $(CPU_FLAG_PREFIX),$(_OS_GPU_RUNNER_CPU_FLAGS))" \
+	$(GPU_COMPILER_EXTRA_FLAGS) \
+	-Xcompiler "$(addprefix $(CPU_FLAG_PREFIX),$(GPU_VECTOR_FLAGS))" \
 	-t2 \
 	-DGGML_CUDA_DMMV_X=32 \
 	-DGGML_CUDA_MMV_Y=1 \
diff --git a/llama/make/gpu.make b/llama/make/gpu.make
index b76af061..39354fb9 100644
--- a/llama/make/gpu.make
+++ b/llama/make/gpu.make
@@ -25,14 +25,8 @@ GPU_GOFLAGS="-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$(VERS
 # today, cuda is bundled, but rocm is split out.  Should split them each out by runner
 DIST_GPU_RUNNER_DEPS_DIR = $(DIST_LIB_DIR)
 
-ifeq ($(OS),windows)
-	_OS_GPU_RUNNER_CPU_FLAGS=$(call uc,$(GPU_RUNNER_CPU_FLAGS))
-else ifeq ($(OS),linux)
-	_OS_GPU_RUNNER_CPU_FLAGS=$(GPU_RUNNER_CPU_FLAGS)
-endif
 
 GPU_RUNNER_LIBS = $(wildcard $(addsuffix .$(SHARED_EXT).*,$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT))))
-DIST_GPU_RUNNER_LIB_DEPS = $(addprefix $(DIST_GPU_RUNNER_DEPS_DIR)/,$(notdir $(GPU_RUNNER_LIBS)))
 
 GPU_RUNNER_SRCS := \
 	ggml-cuda.cu \
@@ -60,11 +54,11 @@ GPU_RUNNER_OBJS := $(GPU_RUNNER_SRCS:.cu=.$(GPU_RUNNER_NAME).$(OBJ_EXT))
 GPU_RUNNER_OBJS := $(GPU_RUNNER_OBJS:.c=.$(GPU_RUNNER_NAME).$(OBJ_EXT))
 GPU_RUNNER_OBJS := $(addprefix $(BUILD_DIR)/,$(GPU_RUNNER_OBJS:.cpp=.$(GPU_RUNNER_NAME).$(OBJ_EXT)))
 
-DIST_RUNNERS = $(addprefix $(RUNNERS_DIST_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT),$(GPU_RUNNER_NAME)))
+DIST_RUNNERS = $(addprefix $(RUNNERS_DIST_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT),$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)))
 ifneq ($(OS),windows)
-PAYLOAD_RUNNERS = $(addprefix $(RUNNERS_PAYLOAD_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT).gz,$(GPU_RUNNER_NAME)))
+PAYLOAD_RUNNERS = $(addprefix $(RUNNERS_PAYLOAD_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT).gz,$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)))
 endif
-BUILD_RUNNERS = $(addprefix $(RUNNERS_BUILD_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT),$(GPU_RUNNER_NAME)))
+BUILD_RUNNERS = $(addprefix $(RUNNERS_BUILD_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT),$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)))
 
 
 $(GPU_RUNNER_NAME): $(BUILD_RUNNERS) 
@@ -83,11 +77,11 @@ $(BUILD_DIR)/%.$(GPU_RUNNER_NAME).$(OBJ_EXT): %.c
 $(BUILD_DIR)/%.$(GPU_RUNNER_NAME).$(OBJ_EXT): %.cpp
 	@-mkdir -p $(dir $@)
 	$(CCACHE) $(GPU_COMPILER) -c $(GPU_COMPILER_CXXFLAGS) -o $@ $<
-$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/ollama_llama_server$(EXE_EXT): TARGET_CGO_LDFLAGS = -L"$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/" $(CGO_EXTRA_LDFLAGS)
-$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/ollama_llama_server$(EXE_EXT): $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT) *.go ./runner/*.go $(COMMON_SRCS) $(COMMON_HDRS)
+$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/ollama_llama_server$(EXE_EXT): TARGET_CGO_LDFLAGS = -L"$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/" $(CGO_EXTRA_LDFLAGS)
+$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/ollama_llama_server$(EXE_EXT): $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT) *.go ./runner/*.go $(COMMON_SRCS) $(COMMON_HDRS)
 	@-mkdir -p $(dir $@)
-	GOARCH=$(ARCH) CGO_LDFLAGS="$(TARGET_CGO_LDFLAGS)" go build -buildmode=pie  $(GPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(GPU_RUNNER_CPU_FLAGS) $(GPU_RUNNER_GO_TAGS)) -o $@ ./runner
-$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT): $(GPU_RUNNER_OBJS) $(DIST_GPU_RUNNER_LIB_DEPS) $(COMMON_HDRS) $(GPU_RUNNER_HDRS)
+	GOARCH=$(ARCH) CGO_LDFLAGS="$(TARGET_CGO_LDFLAGS)" go build -buildmode=pie $(GPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(GPU_RUNNER_CPU_FLAGS) $(GPU_RUNNER_GO_TAGS)) -o $@ ./runner
+$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT): $(GPU_RUNNER_OBJS) $(COMMON_HDRS) $(GPU_RUNNER_HDRS)
 	@-mkdir -p $(dir $@)
 	$(CCACHE) $(GPU_COMPILER) --shared -L$(GPU_LIB_DIR) $(GPU_RUNNER_DRIVER_LIB_LINK) -L${DIST_GPU_RUNNER_DEPS_DIR} $(foreach lib, $(GPU_RUNNER_LIBS_SHORT), -l$(lib)) $(GPU_RUNNER_OBJS) -o $@
 
@@ -95,13 +89,10 @@ $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).
 $(RUNNERS_DIST_DIR)/%: $(RUNNERS_BUILD_DIR)/%
 	@-mkdir -p $(dir $@)
 	$(CP) $< $@
-$(RUNNERS_DIST_DIR)/$(GPU_RUNNER_NAME)/ollama_llama_server$(EXE_EXT): $(DIST_LIB_DIR)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT) $(GPU_DIST_DEPS_LIBS)
-$(DIST_LIB_DIR)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT): $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT)
+$(RUNNERS_DIST_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/ollama_llama_server$(EXE_EXT): $(DIST_LIB_DIR)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT) $(GPU_DIST_DEPS_LIBS)
+$(DIST_LIB_DIR)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT): $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT)
 	@-mkdir -p $(dir $@)
 	$(CP) $< $@
-$(DIST_GPU_RUNNER_LIB_DEPS): 
-	@-mkdir -p $(dir $@)
-	$(CP) $(GPU_LIB_DIR)/$(notdir $@) $(dir $@)
 $(GPU_DIST_DEPS_LIBS): 
 	@-mkdir -p $(dir $@)
 	$(CP) $(dir $(filter %$(notdir $@),$(GPU_LIBS) $(GPU_TRANSITIVE_LIBS)))/$(notdir $@) $(dir $@)
@@ -110,7 +101,7 @@ $(GPU_DIST_DEPS_LIBS):
 $(RUNNERS_PAYLOAD_DIR)/%/ollama_llama_server.gz: $(RUNNERS_BUILD_DIR)/%/ollama_llama_server 
 	@-mkdir -p $(dir $@)
 	${GZIP} --best -c $< > $@
-$(RUNNERS_PAYLOAD_DIR)/$(GPU_RUNNER_NAME)/%.gz: $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/%
+$(RUNNERS_PAYLOAD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/%.gz: $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/%
 	@-mkdir -p $(dir $@)
 	${GZIP} --best -c $< > $@
 
diff --git a/llama/runner/requirements.go b/llama/runner/requirements.go
deleted file mode 100644
index 71b3b9aa..00000000
--- a/llama/runner/requirements.go
+++ /dev/null
@@ -1,19 +0,0 @@
-package main
-
-import (
-	"encoding/json"
-	"os"
-
-	"github.com/ollama/ollama/llama"
-	"github.com/ollama/ollama/version"
-)
-
-func printRequirements(fp *os.File) {
-	attrs := map[string]string{
-		"system_info":  llama.PrintSystemInfo(),
-		"version":      version.Version,
-		"cpu_features": llama.CpuFeatures,
-	}
-	enc := json.NewEncoder(fp)
-	_ = enc.Encode(attrs)
-}
diff --git a/llama/runner/runner.go b/llama/runner/runner.go
index b680f060..b74171b2 100644
--- a/llama/runner/runner.go
+++ b/llama/runner/runner.go
@@ -818,13 +818,8 @@ func main() {
 	mlock := flag.Bool("mlock", false, "force system to keep model in RAM rather than swapping or compressing")
 	tensorSplit := flag.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions")
 	multiUserCache := flag.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users")
-	requirements := flag.Bool("requirements", false, "print json requirement information")
 
 	flag.Parse()
-	if *requirements {
-		printRequirements(os.Stdout)
-		return
-	}
 	level := slog.LevelInfo
 	if *verbose {
 		level = slog.LevelDebug
diff --git a/runners/common.go b/runners/common.go
index 29436468..de9fde0c 100644
--- a/runners/common.go
+++ b/runners/common.go
@@ -17,6 +17,7 @@ import (
 	"syscall"
 
 	"golang.org/x/sync/errgroup"
+	"golang.org/x/sys/cpu"
 
 	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/envconfig"
@@ -288,7 +289,16 @@ func GetAvailableServers(payloadsDir string) map[string]string {
 	servers := make(map[string]string)
 	for _, file := range files {
 		slog.Debug("availableServers : found", "file", file)
-		servers[filepath.Base(filepath.Dir(file))] = filepath.Dir(file)
+		runnerName := filepath.Base(filepath.Dir(file))
+		// Special case for our GPU runners: if a runner was compiled with the
+		// standard AVX flag, skip it when the host CPU does not support AVX.
+		// Custom builds omit this suffix, and it's up to the user to ensure compatibility.
+		parsed := strings.Split(runnerName, "_")
+		if len(parsed) == 3 && parsed[2] == "avx" && !cpu.X86.HasAVX {
+			slog.Info("GPU runner incompatible with host system, CPU does not have AVX", "runner", runnerName)
+			continue
+		}
+		servers[runnerName] = filepath.Dir(file)
 	}
 
 	return servers
diff --git a/scripts/build_windows.ps1 b/scripts/build_windows.ps1
index 32ba7652..40e2569b 100644
--- a/scripts/build_windows.ps1
+++ b/scripts/build_windows.ps1
@@ -82,7 +82,7 @@ function buildOllama() {
     if ($null -eq ${env:OLLAMA_SKIP_GENERATE}) {
         write-host "Building ollama runners"
         Remove-Item -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}"
-        & make -C llama -j 12
+        & make -C llama -j 12 dist
         if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
     } else {
         write-host "Skipping generate step with OLLAMA_SKIP_GENERATE set"