diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index a0f5e4d4..37327a6a 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -85,7 +85,7 @@ jobs: import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll' Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo' if (!(gcc --version | select-string -quiet clang)) { throw "wrong gcc compiler detected - must be clang" } - make + make dist name: make - uses: actions/upload-artifact@v4 with: @@ -143,8 +143,8 @@ jobs: import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll' Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo' if (!(gcc --version | select-string -quiet clang)) { throw "wrong gcc compiler detected - must be clang" } - make -C llama print-HIP_PATH print-HIP_LIB_DIR - make rocm + make help-runners + make dist_rocm - uses: actions/upload-artifact@v4 with: name: generate-windows-rocm @@ -226,7 +226,7 @@ jobs: import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll' Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo' if (!(gcc --version | select-string -quiet clang)) { throw "wrong gcc compiler detected - must be clang" } - make cuda_v$(($env:CUDA_PATH | split-path -leaf) -replace 'v(\d+).*', '$1') + make dist_cuda_v$(($env:CUDA_PATH | split-path -leaf) -replace 'v(\d+).*', '$1') - uses: actions/upload-artifact@v4 with: name: generate-windows-cuda-${{ matrix.cuda.version }} diff --git a/Dockerfile b/Dockerfile index ca09325c..9750c74e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,9 +1,7 @@ ARG GOLANG_VERSION=1.22.8 ARG CMAKE_VERSION=3.22.1 ARG CUDA_VERSION_11=11.3.1 -ARG CUDA_V11_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86" ARG CUDA_VERSION_12=12.4.0 -ARG CUDA_V12_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a" ARG ROCM_VERSION=6.1.2 ARG JETPACK_6=r36.2.0 ARG JETPACK_5=r35.4.1 @@ -62,7 +60,7 @@ RUN yum-config-manager --add-repo https://developer.download.nvidia.com/compute/ ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH:/usr/local/cuda/bin ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/cuda/lib64 ENV LIBRARY_PATH=/usr/local/cuda/lib64/stubs:/opt/amdgpu/lib64 -ENV GOARCH amd64 +ENV GOARCH arm64 ENV CGO_ENABLED 1 WORKDIR /go/src/github.com/ollama/ollama/ ENTRYPOINT [ "zsh" ] @@ -70,29 +68,21 @@ ENTRYPOINT [ "zsh" ] FROM --platform=linux/amd64 unified-builder-amd64 AS runners-amd64 COPY . . ARG OLLAMA_SKIP_CUDA_GENERATE -ARG OLLAMA_SKIP_CUDA_11_GENERATE -ARG OLLAMA_SKIP_CUDA_12_GENERATE ARG OLLAMA_SKIP_ROCM_GENERATE -ARG CUDA_V11_ARCHITECTURES -ARG CUDA_V12_ARCHITECTURES ARG OLLAMA_FAST_BUILD RUN --mount=type=cache,target=/root/.ccache \ if grep "^flags" /proc/cpuinfo|grep avx>/dev/null; then \ - make -j $(expr $(nproc) / 2 ) ; \ + make -j $(expr $(nproc) / 2 ) dist payload ; \ else \ - make -j 5 ; \ + make -j 5 dist payload ; \ fi FROM --platform=linux/arm64 unified-builder-arm64 AS runners-arm64 COPY . . ARG OLLAMA_SKIP_CUDA_GENERATE -ARG OLLAMA_SKIP_CUDA_11_GENERATE -ARG OLLAMA_SKIP_CUDA_12_GENERATE -ARG CUDA_V11_ARCHITECTURES -ARG CUDA_V12_ARCHITECTURES ARG OLLAMA_FAST_BUILD RUN --mount=type=cache,target=/root/.ccache \ - make -j 5 + make -j 5 dist payload # Jetsons need to be built in discrete stages FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_5} AS runners-jetpack5-arm64 @@ -107,7 +97,7 @@ COPY . . ARG CGO_CFLAGS ENV GOARCH arm64 RUN --mount=type=cache,target=/root/.ccache \ - make -j 5 cuda_v11 \ + make -j 5 dist_cuda_v11 payload_cuda_v11 \ CUDA_ARCHITECTURES="72;87" \ GPU_RUNNER_VARIANT=_jetpack5 \ CGO_EXTRA_LDFLAGS_LINUX=-L/usr/local/cuda/lib64/stubs \ @@ -126,7 +116,7 @@ COPY . . ARG CGO_CFLAGS ENV GOARCH arm64 RUN --mount=type=cache,target=/root/.ccache \ - make -j 5 cuda_v12 \ + make -j 5 dist_cuda_v12 payload_cuda_v12 \ CUDA_ARCHITECTURES="87" \ GPU_RUNNER_VARIANT=_jetpack6 \ CGO_EXTRA_LDFLAGS_LINUX=-L/usr/local/cuda/lib64/stubs \ diff --git a/docs/development.md b/docs/development.md index 3520dc79..202c867e 100644 --- a/docs/development.md +++ b/docs/development.md @@ -3,7 +3,7 @@ Install required tools: - go version 1.22 or higher -- OS specific native compiler (see below) +- OS specific C/C++ compiler (see below) - GNU Make @@ -69,16 +69,6 @@ make -j 5 ROCm requires elevated privileges to access the GPU at runtime. On most distros you can add your user account to the `render` group, or run as root. -#### Advanced CPU Settings - -By default, running `make` will compile a few different variations -of the LLM library based on common CPU families and vector math capabilities, -including a lowest-common-denominator which should run on almost any 64 bit CPU -somewhat slowly. At runtime, Ollama will auto-detect the optimal variation to -load. - -Custom CPU settings are not currently supported in the new Go server build but will be added back after we complete the transition. - #### Containerized Linux Build If you have Docker and buildx available, you can build linux binaries with `./scripts/build_linux.sh` which has the CUDA and ROCm dependencies included. The resulting artifacts are placed in `./dist` and by default the script builds both arm64 and amd64 binaries. If you want to build only amd64, you can build with `PLATFORM=linux/amd64 ./scripts/build_linux.sh` @@ -142,3 +132,30 @@ pacman -S mingw-w64-clang-aarch64-clang mingw-w64-clang-aarch64-gcc-compat mingw ``` You will need to ensure your PATH includes go, cmake, gcc and clang mingw32-make to build ollama from source. (typically `C:\msys64\clangarm64\bin\`) + + +## Advanced CPU Vector Settings + +On x86, running `make` will compile several CPU runners which can run on different CPU families. At runtime, Ollama will auto-detect the best variation to load. If GPU libraries are present at build time, Ollama also compiles GPU runners with the `AVX` CPU vector feature enabled. This provides a good performance balance when loading large models that split across GPU and CPU with broad compatibility. Some users may prefer no vector extensions (e.g. older Xeon/Celeron processors, or hypervisors that mask the vector features) while other users may prefer turning on many more vector extensions to further improve performance for split model loads. + +To customize the set of CPU vector features enabled for a CPU runner and all GPU runners, use CUSTOM_CPU_FLAGS during the build. + +To build without any vector flags: + +``` +make CUSTOM_CPU_FLAGS="" +``` + +To build with both AVX and AVX2: +``` +make CUSTOM_CPU_FLAGS=avx,avx2 +``` + +To build with AVX512 features turned on: + +``` +make CUSTOM_CPU_FLAGS=avx,avx2,avx512,avx512vbmi,avx512vnni,avx512bf16 +``` + +> [!NOTE] +> If you are experimenting with different flags, make sure to do a `make clean` between each change to ensure everything is rebuilt with the new compiler flags diff --git a/llama/Makefile b/llama/Makefile index 43fe069c..6b04312c 100644 --- a/llama/Makefile +++ b/llama/Makefile @@ -4,26 +4,20 @@ include make/common-defs.make RUNNER_TARGETS := default # Determine which if any GPU runners we should build +include make/cuda-v11-defs.make +include make/cuda-v12-defs.make + ifeq ($(OS),windows) - CUDA_PATH?=$(shell cygpath -m -s "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\" 2>/dev/null)unknown - CUDA_BASE_DIR := $(dir $(shell cygpath -m -s "$(CUDA_PATH)\\.." 2>/dev/null)) - CUDA_11:=$(shell ls -d $(CUDA_BASE_DIR)/v11.? 2>/dev/null) - CUDA_11_COMPILER:=$(wildcard $(CUDA_11)/bin/nvcc.exe) - CUDA_12:=$(shell ls -d $(CUDA_BASE_DIR)/v12.? 2>/dev/null) - CUDA_12_COMPILER:=$(wildcard $(CUDA_12)/bin/nvcc.exe) HIP_LIB_DIR := $(shell ls -d $(HIP_PATH)/lib 2>/dev/null) HIP_COMPILER:=$(wildcard $(HIP_PATH)/bin/hipcc.bin.exe) else ifeq ($(OS),linux) - CUDA_PATH?=/usr/local/cuda - CUDA_11:=$(shell ls -d $(CUDA_PATH)-11 2>/dev/null) - CUDA_11_COMPILER:=$(wildcard $(CUDA_11)/bin/nvcc) - CUDA_12:=$(shell ls -d $(CUDA_PATH)-12 2>/dev/null) - CUDA_12_COMPILER:=$(wildcard $(CUDA_11)/bin/nvcc) HIP_PATH?=/opt/rocm HIP_LIB_DIR := $(shell ls -d $(HIP_PATH)/lib 2>/dev/null) HIP_COMPILER:=$(wildcard $(HIP_PATH)/bin/hipcc) endif +# Without CUSTOM_CPU_FLAGS we default to build both v11 and v12 if present +ifeq ($(CUSTOM_CPU_FLAGS),) ifeq ($(OLLAMA_SKIP_CUDA_GENERATE),) ifneq ($(CUDA_11_COMPILER),) RUNNER_TARGETS += cuda_v11 @@ -32,6 +26,14 @@ ifneq ($(CUDA_12_COMPILER),) RUNNER_TARGETS += cuda_v12 endif endif +else # CUSTOM_CPU_FLAGS is set, we'll build only the latest cuda version detected +ifneq ($(CUDA_12),) + RUNNER_TARGETS += cuda_v12 +else ifneq ($(CUDA_11),) + RUNNER_TARGETS += cuda_v11 +endif +endif + ifeq ($(OLLAMA_SKIP_ROCM_GENERATE),) ifneq ($(HIP_COMPILER),) RUNNER_TARGETS += rocm @@ -46,10 +48,10 @@ dist: $(addprefix dist_, $(RUNNER_TARGETS)) dist_%: @$(MAKE) --no-print-directory -f make/Makefile.$* dist -payload: clean-payload .WAIT $(addprefix payload_, $(RUNNER_TARGETS)) +payload: $(addprefix payload_, $(RUNNER_TARGETS)) payload_%: - @$(MAKE) --no-print-directory f make/Makefile.$* dist + @$(MAKE) --no-print-directory -f make/Makefile.$* payload runners: $(RUNNER_TARGETS) @@ -65,8 +67,8 @@ help-sync apply-patches create-patches sync: test integration lint: @$(MAKE) --no-print-directory -f make/Makefile.test $@ -clean: - rm -rf $(BUILD_DIR) $(DIST_RUNNERS) $(PAYLOAD_RUNNERS) +clean: clean-payload + rm -rf "$(BUILD_DIR)" "$(DIST_LIB_DIR)" go clean -cache clean-payload: @@ -75,9 +77,9 @@ clean-payload: help: @echo "The following make targets will help you BUILD Ollama" @echo "" - @echo " make all # (default) Build Ollama llm subprocess runners, and the primary ollama executable" + @echo " make all # (default target) Build Ollama llm subprocess runners, and the primary ollama executable" @echo " make runners # Build Ollama llm subprocess runners; after you may use 'go build .' to build the primary ollama exectuable" - @echo " make # Build specific runners. Enabled: $(RUNNER_TARGETS)" + @echo " make # Build specific runners. Enabled: '$(RUNNER_TARGETS)'" @echo " make payload # Build the runners as payloads (Linux/Mac only)" @echo " make dist # Build the runners for distribution and gather dependencies" @echo " make help-sync # Help information on vendor update targets" @@ -94,9 +96,11 @@ help: help-runners: - @echo "The following runners will be built based on discovered GPU libraries: $(RUNNER_TARGETS)" + @echo "The following runners will be built based on discovered GPU libraries: '$(RUNNER_TARGETS)'" @echo "(On MacOS arm64 'default' is the metal runner. For all other platforms 'default' is one or more CPU runners)" @echo "" + @echo "GPU Runner CPU Flags: '$(GPU_RUNNER_CPU_FLAGS)' (Override with CUSTOM_CPU_FLAGS)" + @echo "" @echo "# CUDA_PATH sets the location where CUDA toolkits are present" @echo "CUDA_PATH=$(CUDA_PATH)" @echo " CUDA_11=$(CUDA_11)" @@ -108,7 +112,7 @@ help-runners: @echo "HIP_PATH=$(HIP_PATH)" @echo " HIP_COMPILER=$(HIP_COMPILER)" -.PHONY: all exe dist payload help help-sync help-runners test integration lint runners clean clean-payload $(RUNNER_TARGETS) $(addprefix dist_, $(RUNNER_TARGETS)) $(addprefix payload_, $(RUNNER_TARGETS)) .WAIT +.PHONY: all exe dist payload help help-sync help-runners test integration lint runners clean clean-payload $(RUNNER_TARGETS) # Handy debugging for make variables print-%: diff --git a/llama/llama.go b/llama/llama.go index dbb02768..7861b69b 100644 --- a/llama/llama.go +++ b/llama/llama.go @@ -9,17 +9,19 @@ package llama #cgo amd64,avx CXXFLAGS: -mavx #cgo amd64,avx2 CFLAGS: -mavx2 -mfma #cgo amd64,avx2 CXXFLAGS: -mavx2 -mfma +#cgo amd64,avx512 CFLAGS: -mavx512f -mavx512dq -mavx512bw +#cgo amd64,avx512 CXXFLAGS: -mavx512f -mavx512dq -mavx512bw +#cgo amd64,avx512bf16 CFLAGS: -mavx512bf16 -D__AVX512BF16__ +#cgo amd64,avx512bf16 CXXFLAGS: -mavx512bf16 -D__AVX512BF16__ +#cgo amd64,avx512vbmi CFLAGS: -mavx512vbmi -D__AVX512VBMI__ +#cgo amd64,avx512vbmi CXXFLAGS: -mavx512vbmi -D__AVX512VBMI__ +#cgo amd64,avx512vnni CFLAGS: -mavx512vnni -D__AVX512VNNI__ +#cgo amd64,avx512vnni CXXFLAGS: -mavx512vnni -D__AVX512VNNI__ #cgo amd64,f16c CFLAGS: -mf16c #cgo amd64,f16c CXXFLAGS: -mf16c #cgo amd64,fma CFLAGS: -mfma #cgo amd64,fma CXXFLAGS: -mfma -#cgo avx CFLAGS: -mavx -#cgo avx CXXFLAGS: -mavx -#cgo avx2 CFLAGS: -mavx2 -mfma -mf16c -#cgo avx2 CXXFLAGS: -mavx2 -mfma -mf16c #cgo cuda CFLAGS: -fPIE -DGGML_USE_CUDA -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1 -#cgo cuda CFLAGS: -fPIE -DGGML_USE_CUDA -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1 -#cgo cuda CXXFLAGS: -DGGML_USE_CUDA -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1 #cgo cuda CXXFLAGS: -DGGML_USE_CUDA -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1 #cgo cuda_jetpack5 LDFLAGS: -lggml_cuda_jetpack5 -L/usr/local/cuda-11/lib64 #cgo cuda_jetpack6 LDFLAGS: -lggml_cuda_jetpack6 -L/usr/local/cuda-12/lib64 @@ -37,7 +39,6 @@ package llama #cgo linux CFLAGS: -D_GNU_SOURCE #cgo linux CXXFLAGS: -D_GNU_SOURCE #cgo linux,amd64 LDFLAGS: -L${SRCDIR}/build/Linux/amd64 -#cgo linux,amd64 LDFLAGS: -L${SRCDIR}/build/Linux/amd64 #cgo linux,arm64 CFLAGS: -D__aarch64__ -D__ARM_NEON -D__ARM_FEATURE_FMA #cgo linux,arm64 CXXFLAGS: -D__aarch64__ -D__ARM_NEON -D__ARM_FEATURE_FMA #cgo linux,arm64 LDFLAGS: -L${SRCDIR}/build/Linux/arm64 @@ -50,14 +51,11 @@ package llama #cgo rocm LDFLAGS: -L${SRCDIR} -lggml_rocm -lhipblas -lamdhip64 -lrocblas #cgo windows CFLAGS: -Wno-discarded-qualifiers -D_WIN32_WINNT=0x602 #cgo windows CXXFLAGS: -D_WIN32_WINNT=0x602 -#cgo windows LDFLAGS: -lmsvcrt #cgo windows LDFLAGS: -lmsvcrt -static-libstdc++ -static-libgcc -static #cgo windows,amd64 LDFLAGS: -L${SRCDIR}/build/Windows/amd64 -#cgo windows,amd64 LDFLAGS: -L${SRCDIR}/build/Windows/amd64 #cgo windows,arm64 CFLAGS: -D__aarch64__ -D__ARM_NEON -D__ARM_FEATURE_FMA #cgo windows,arm64 CXXFLAGS: -D__aarch64__ -D__ARM_NEON -D__ARM_FEATURE_FMA #cgo windows,arm64 LDFLAGS: -L${SRCDIR}/build/Windows/arm64 -#cgo windows,arm64 LDFLAGS: -L${SRCDIR}/build/Windows/arm64 #cgo windows,cuda LDFLAGS: -lcuda -lcudart -lcublas -lcublasLt #cgo windows,rocm LDFLAGS: -lggml_rocm -lhipblas -lamdhip64 -lrocblas diff --git a/llama/make/Makefile.cuda_v11 b/llama/make/Makefile.cuda_v11 index 528e0efe..ab5e72e4 100644 --- a/llama/make/Makefile.cuda_v11 +++ b/llama/make/Makefile.cuda_v11 @@ -1,7 +1,7 @@ # Build rules for CUDA v11 runner include make/common-defs.make - +include make/cuda-v11-defs.make GPU_RUNNER_VARIANT := _v11 GPU_PATH_ROOT_WIN=$(shell ls -d $(dir $(shell cygpath -m -s "$(CUDA_PATH)\.."))/v11.? 2>/dev/null) diff --git a/llama/make/Makefile.cuda_v12 b/llama/make/Makefile.cuda_v12 index 2418ef00..63fd48a8 100644 --- a/llama/make/Makefile.cuda_v12 +++ b/llama/make/Makefile.cuda_v12 @@ -1,7 +1,7 @@ # Build rules for CUDA v12 runner include make/common-defs.make - +include make/cuda-v12-defs.make GPU_RUNNER_VARIANT := _v12 GPU_PATH_ROOT_WIN=$(shell ls -d $(dir $(shell cygpath -m -s "$(CUDA_PATH)\.."))/v12.? 2>/dev/null) diff --git a/llama/make/Makefile.default b/llama/make/Makefile.default index 16011b1b..53026bfc 100644 --- a/llama/make/Makefile.default +++ b/llama/make/Makefile.default @@ -8,7 +8,7 @@ CPU_GOFLAGS="-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$(VERS DEFAULT_RUNNER := $(if $(and $(filter darwin,$(OS)),$(filter arm64,$(ARCH))),metal,cpu) RUNNERS := $(DEFAULT_RUNNER) ifeq ($(ARCH),amd64) -ifeq ($(CUSTOM_CPU_FLAGS),) +ifeq ($(origin CUSTOM_CPU_FLAGS),undefined) RUNNERS += cpu_avx cpu_avx2 endif endif @@ -51,7 +51,7 @@ $(RUNNERS_PAYLOAD_DIR)/%/ollama_llama_server$(EXE_EXT).gz: $(RUNNERS_BUILD_DIR)/ clean: rm -f $(BUILD_RUNNERS) $(DIST_RUNNERS) $(PAYLOAD_RUNNERS) -.PHONY: clean default +.PHONY: clean default dist # Handy debugging for make variables print-%: diff --git a/llama/make/Makefile.rocm b/llama/make/Makefile.rocm index 8697fe43..79b5e7c5 100644 --- a/llama/make/Makefile.rocm +++ b/llama/make/Makefile.rocm @@ -56,9 +56,12 @@ else ifeq ($(OS),windows) endif GPU_RUNNER_ARCH_FLAGS := $(foreach arch,$(subst ;,$(space),$(HIP_ARCHS)),--offload-arch=$(arch)) +# HIPCC uses clang which requires avx512 -> -mavx512f -mavx512dq -mavx512bw +GPU_VECTOR_FLAGS=$(if $(filter avx512,$(GPU_RUNNER_CPU_FLAGS)),avx512f avx512dq avx512bw) $(filter-out avx512,$(GPU_RUNNER_CPU_FLAGS)) + GPU_COMPILER_CUFLAGS = \ $(GPU_COMPILER_FPIC) \ - $(addprefix -m,$(GPU_RUNNER_CPU_FLAGS)) \ + $(addprefix -m,$(GPU_VECTOR_FLAGS)) \ -mf16c \ -mfma \ -parallel-jobs=2 \ @@ -102,7 +105,7 @@ endif include make/gpu.make # Adjust the rules from gpu.make to handle the ROCm dependencies properly -$(RUNNERS_DIST_DIR)/$(GPU_RUNNER_NAME)/ollama_llama_server$(EXE_EXT): $(ROCBLAS_DIST_DEP_MANIFEST) +$(RUNNERS_DIST_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/ollama_llama_server$(EXE_EXT): $(ROCBLAS_DIST_DEP_MANIFEST) $(ROCBLAS_DIST_DEP_MANIFEST): @-mkdir -p $(dir $@) @echo "Copying rocblas library..." diff --git a/llama/make/Makefile.test b/llama/make/Makefile.test index e6423720..9b70e934 100644 --- a/llama/make/Makefile.test +++ b/llama/make/Makefile.test @@ -11,7 +11,7 @@ integration: $(OLLAMA_EXE) lint: cd $(abspath $(SRC_DIR)/..) && golangci-lint run -v -# Note: in this makefile we error instead of building to allow more fine-grain control if testing flows +# Note: in this makefile we error instead of building to allow more fine-grain control of testing flows $(OLLAMA_EXE): @echo "" @echo "ERROR: You must build ollama first - use 'make all' to build the ollama binaries" diff --git a/llama/make/common-defs.make b/llama/make/common-defs.make index 006c6001..77f8bca3 100644 --- a/llama/make/common-defs.make +++ b/llama/make/common-defs.make @@ -44,9 +44,14 @@ ifneq ($(CCACHE),) endif -# Override in environment space separated to tune GPU runner CPU vector flags +# Override in environment to tune CPU vector flags ifeq ($(ARCH),amd64) - GPU_RUNNER_CPU_FLAGS ?= avx +ifeq ($(origin CUSTOM_CPU_FLAGS),undefined) + GPU_RUNNER_CPU_FLAGS=avx + GPU_RUNNER_EXTRA_VARIANT=_avx +else + GPU_RUNNER_CPU_FLAGS=$(subst $(comma),$(space),$(CUSTOM_CPU_FLAGS)) +endif endif ifeq ($(OS),windows) diff --git a/llama/make/cuda-v11-defs.make b/llama/make/cuda-v11-defs.make new file mode 100644 index 00000000..71a21edd --- /dev/null +++ b/llama/make/cuda-v11-defs.make @@ -0,0 +1,13 @@ +# Common definitions for the various Makefiles which set cuda settings +# No rules are defined here so this is safe to include at the beginning of other makefiles + +ifeq ($(OS),windows) + CUDA_PATH?=$(shell cygpath -m -s "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\" 2>/dev/null)unknown + CUDA_BASE_DIR := $(dir $(shell cygpath -m -s "$(CUDA_PATH)\\.." 2>/dev/null)) + CUDA_11:=$(shell ls -d $(CUDA_BASE_DIR)/v11.? 2>/dev/null) + CUDA_11_COMPILER:=$(wildcard $(CUDA_11)/bin/nvcc.exe) +else ifeq ($(OS),linux) + CUDA_PATH?=/usr/local/cuda + CUDA_11:=$(shell ls -d $(CUDA_PATH)-11 2>/dev/null) + CUDA_11_COMPILER:=$(wildcard $(CUDA_11)/bin/nvcc) +endif diff --git a/llama/make/cuda-v12-defs.make b/llama/make/cuda-v12-defs.make new file mode 100644 index 00000000..45fd47a7 --- /dev/null +++ b/llama/make/cuda-v12-defs.make @@ -0,0 +1,13 @@ +# Common definitions for the various Makefiles which set cuda settings +# No rules are defined here so this is safe to include at the beginning of other makefiles + +ifeq ($(OS),windows) + CUDA_PATH?=$(shell cygpath -m -s "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\" 2>/dev/null)unknown + CUDA_BASE_DIR := $(dir $(shell cygpath -m -s "$(CUDA_PATH)\\.." 2>/dev/null)) + CUDA_12:=$(shell ls -d $(CUDA_BASE_DIR)/v12.? 2>/dev/null) + CUDA_12_COMPILER:=$(wildcard $(CUDA_12)/bin/nvcc.exe) +else ifeq ($(OS),linux) + CUDA_PATH?=/usr/local/cuda + CUDA_12:=$(shell ls -d $(CUDA_PATH)-12 2>/dev/null) + CUDA_12_COMPILER:=$(wildcard $(CUDA_12)/bin/nvcc) +endif diff --git a/llama/make/cuda.make b/llama/make/cuda.make index 7a4b1036..aa5eb060 100644 --- a/llama/make/cuda.make +++ b/llama/make/cuda.make @@ -11,26 +11,38 @@ GPU_RUNNER_GO_TAGS := cuda cuda$(GPU_RUNNER_VARIANT) GPU_RUNNER_DRIVER_LIB_LINK := -lcuda GPU_RUNNER_LIBS_SHORT := cublas cudart cublasLt GPU_LIB_DIR_WIN = $(GPU_PATH_ROOT_WIN)/bin -GPU_LIB_DIR_LINUX = $(GPU_PATH_ROOT_LINUX)/lib64 +ifneq ($(GPU_PATH_ROOT_LINUX),) +GPU_LIB_DIR_LINUX=$(strip $(shell ls -d $(GPU_PATH_ROOT_LINUX)/lib64 2>/dev/null || ls -d $(GPU_PATH_ROOT_LINUX)/lib 2>/dev/null)) +GPU_COMPILER_LINUX = $(GPU_PATH_ROOT_LINUX)/bin/nvcc +endif CGO_EXTRA_LDFLAGS_WIN = -L"$(GPU_PATH_ROOT_WIN)/lib/x64" GPU_COMPILER_WIN = $(GPU_PATH_ROOT_WIN)/bin/nvcc -GPU_COMPILER_LINUX = $(GPU_PATH_ROOT_LINUX)/bin/nvcc GPU_COMPILER_CFLAGS_WIN = $(CFLAGS) -D_WIN32_WINNT=0x602 GPU_COMPILER_CFLAGS_LINUX = $(CFLAGS) -Xcompiler -fPIC -D_GNU_SOURCE GPU_COMPILER_CXXFLAGS_WIN = $(CXXFLAGS) -D_WIN32_WINNT=0x602 GPU_COMPILER_CXXFLAGS_LINUX = $(CXXFLAGS) -Xcompiler -fPIC -D_GNU_SOURCE -GPU_LIBS = $(sort $(wildcard $(addsuffix *.$(SHARED_EXT)*,$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT))))) + +ifeq ($(OS),windows) + # On windows, nvcc uses msvc which does not support avx512vbmi avx512vnni avx512bf16, but macros can turn them on + GPU_VECTOR_FLAGS=$(call uc,$(filter-out avx512bf16,$(filter-out avx512vnni,$(filter-out avx512vbmi,$(GPU_RUNNER_CPU_FLAGS))))) + GPU_COMPILER_EXTRA_FLAGS=$(if $(filter avx512vbmi,$(GPU_RUNNER_CPU_FLAGS)),-D__AVX512VBMI__) + GPU_COMPILER_EXTRA_FLAGS+=$(if $(filter avx512vnni,$(GPU_RUNNER_CPU_FLAGS)),-D__AVX512VNNI__) + GPU_COMPILER_EXTRA_FLAGS+=$(if $(filter avx512bf16,$(GPU_RUNNER_CPU_FLAGS)),-D__AVX512BF16__) + GPU_LIBS = $(sort $(wildcard $(addsuffix *.$(SHARED_EXT),$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT))))) +else ifeq ($(OS),linux) + # On linux, nvcc requires avx512 -> -mavx512f -mavx512dq -mavx512bw + GPU_VECTOR_FLAGS=$(if $(filter avx512,$(GPU_RUNNER_CPU_FLAGS)),avx512f avx512dq avx512bw) $(filter-out avx512,$(GPU_RUNNER_CPU_FLAGS)) + CUDA_PATH?=/usr/local/cuda + GPU_COMPILER_EXTRA_FLAGS = -fPIC -Wno-unused-function -std=c++11 + GPU_LIBS = $(sort $(wildcard $(addsuffix *.$(SHARED_EXT).*,$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT))))) +endif GPU_DIST_DEPS_LIBS= $(sort $(addprefix $(DIST_GPU_RUNNER_DEPS_DIR)/,$(notdir $(GPU_LIBS)))) -ifeq ($(OS),linux) - CUDA_PATH?=/usr/local/cuda - GPU_COMPILER_FPIC = -fPIC -Wno-unused-function -std=c++11 -endif GPU_RUNNER_ARCH_FLAGS := $(foreach arch,$(subst ;,$(space),$(CUDA_ARCHITECTURES)),--generate-code=arch=compute_$(arch)$(comma)code=[compute_$(arch)$(comma)sm_$(arch)]) \ -DGGML_CUDA_USE_GRAPHS=1 GPU_COMPILER_CUFLAGS = \ - $(GPU_COMPILER_FPIC) \ - -Xcompiler "$(addprefix $(CPU_FLAG_PREFIX),$(_OS_GPU_RUNNER_CPU_FLAGS))" \ + $(GPU_COMPILER_EXTRA_FLAGS) \ + -Xcompiler "$(addprefix $(CPU_FLAG_PREFIX),$(GPU_VECTOR_FLAGS))" \ -t2 \ -DGGML_CUDA_DMMV_X=32 \ -DGGML_CUDA_MMV_Y=1 \ diff --git a/llama/make/gpu.make b/llama/make/gpu.make index b76af061..39354fb9 100644 --- a/llama/make/gpu.make +++ b/llama/make/gpu.make @@ -25,14 +25,8 @@ GPU_GOFLAGS="-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$(VERS # today, cuda is bundled, but rocm is split out. Should split them each out by runner DIST_GPU_RUNNER_DEPS_DIR = $(DIST_LIB_DIR) -ifeq ($(OS),windows) - _OS_GPU_RUNNER_CPU_FLAGS=$(call uc,$(GPU_RUNNER_CPU_FLAGS)) -else ifeq ($(OS),linux) - _OS_GPU_RUNNER_CPU_FLAGS=$(GPU_RUNNER_CPU_FLAGS) -endif GPU_RUNNER_LIBS = $(wildcard $(addsuffix .$(SHARED_EXT).*,$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT)))) -DIST_GPU_RUNNER_LIB_DEPS = $(addprefix $(DIST_GPU_RUNNER_DEPS_DIR)/,$(notdir $(GPU_RUNNER_LIBS))) GPU_RUNNER_SRCS := \ ggml-cuda.cu \ @@ -60,11 +54,11 @@ GPU_RUNNER_OBJS := $(GPU_RUNNER_SRCS:.cu=.$(GPU_RUNNER_NAME).$(OBJ_EXT)) GPU_RUNNER_OBJS := $(GPU_RUNNER_OBJS:.c=.$(GPU_RUNNER_NAME).$(OBJ_EXT)) GPU_RUNNER_OBJS := $(addprefix $(BUILD_DIR)/,$(GPU_RUNNER_OBJS:.cpp=.$(GPU_RUNNER_NAME).$(OBJ_EXT))) -DIST_RUNNERS = $(addprefix $(RUNNERS_DIST_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT),$(GPU_RUNNER_NAME))) +DIST_RUNNERS = $(addprefix $(RUNNERS_DIST_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT),$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT))) ifneq ($(OS),windows) -PAYLOAD_RUNNERS = $(addprefix $(RUNNERS_PAYLOAD_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT).gz,$(GPU_RUNNER_NAME))) +PAYLOAD_RUNNERS = $(addprefix $(RUNNERS_PAYLOAD_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT).gz,$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT))) endif -BUILD_RUNNERS = $(addprefix $(RUNNERS_BUILD_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT),$(GPU_RUNNER_NAME))) +BUILD_RUNNERS = $(addprefix $(RUNNERS_BUILD_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT),$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT))) $(GPU_RUNNER_NAME): $(BUILD_RUNNERS) @@ -83,11 +77,11 @@ $(BUILD_DIR)/%.$(GPU_RUNNER_NAME).$(OBJ_EXT): %.c $(BUILD_DIR)/%.$(GPU_RUNNER_NAME).$(OBJ_EXT): %.cpp @-mkdir -p $(dir $@) $(CCACHE) $(GPU_COMPILER) -c $(GPU_COMPILER_CXXFLAGS) -o $@ $< -$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/ollama_llama_server$(EXE_EXT): TARGET_CGO_LDFLAGS = -L"$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/" $(CGO_EXTRA_LDFLAGS) -$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/ollama_llama_server$(EXE_EXT): $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT) *.go ./runner/*.go $(COMMON_SRCS) $(COMMON_HDRS) +$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/ollama_llama_server$(EXE_EXT): TARGET_CGO_LDFLAGS = -L"$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/" $(CGO_EXTRA_LDFLAGS) +$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/ollama_llama_server$(EXE_EXT): $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT) *.go ./runner/*.go $(COMMON_SRCS) $(COMMON_HDRS) @-mkdir -p $(dir $@) - GOARCH=$(ARCH) CGO_LDFLAGS="$(TARGET_CGO_LDFLAGS)" go build -buildmode=pie $(GPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(GPU_RUNNER_CPU_FLAGS) $(GPU_RUNNER_GO_TAGS)) -o $@ ./runner -$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT): $(GPU_RUNNER_OBJS) $(DIST_GPU_RUNNER_LIB_DEPS) $(COMMON_HDRS) $(GPU_RUNNER_HDRS) + GOARCH=$(ARCH) CGO_LDFLAGS="$(TARGET_CGO_LDFLAGS)" go build -buildmode=pie $(GPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(GPU_RUNNER_CPU_FLAGS) $(GPU_RUNNER_GO_TAGS)) -o $@ ./runner +$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT): $(GPU_RUNNER_OBJS) $(COMMON_HDRS) $(GPU_RUNNER_HDRS) @-mkdir -p $(dir $@) $(CCACHE) $(GPU_COMPILER) --shared -L$(GPU_LIB_DIR) $(GPU_RUNNER_DRIVER_LIB_LINK) -L${DIST_GPU_RUNNER_DEPS_DIR} $(foreach lib, $(GPU_RUNNER_LIBS_SHORT), -l$(lib)) $(GPU_RUNNER_OBJS) -o $@ @@ -95,13 +89,10 @@ $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME). $(RUNNERS_DIST_DIR)/%: $(RUNNERS_BUILD_DIR)/% @-mkdir -p $(dir $@) $(CP) $< $@ -$(RUNNERS_DIST_DIR)/$(GPU_RUNNER_NAME)/ollama_llama_server$(EXE_EXT): $(DIST_LIB_DIR)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT) $(GPU_DIST_DEPS_LIBS) -$(DIST_LIB_DIR)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT): $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT) +$(RUNNERS_DIST_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/ollama_llama_server$(EXE_EXT): $(DIST_LIB_DIR)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT) $(GPU_DIST_DEPS_LIBS) +$(DIST_LIB_DIR)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT): $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT) @-mkdir -p $(dir $@) $(CP) $< $@ -$(DIST_GPU_RUNNER_LIB_DEPS): - @-mkdir -p $(dir $@) - $(CP) $(GPU_LIB_DIR)/$(notdir $@) $(dir $@) $(GPU_DIST_DEPS_LIBS): @-mkdir -p $(dir $@) $(CP) $(dir $(filter %$(notdir $@),$(GPU_LIBS) $(GPU_TRANSITIVE_LIBS)))/$(notdir $@) $(dir $@) @@ -110,7 +101,7 @@ $(GPU_DIST_DEPS_LIBS): $(RUNNERS_PAYLOAD_DIR)/%/ollama_llama_server.gz: $(RUNNERS_BUILD_DIR)/%/ollama_llama_server @-mkdir -p $(dir $@) ${GZIP} --best -c $< > $@ -$(RUNNERS_PAYLOAD_DIR)/$(GPU_RUNNER_NAME)/%.gz: $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/% +$(RUNNERS_PAYLOAD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/%.gz: $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/% @-mkdir -p $(dir $@) ${GZIP} --best -c $< > $@ diff --git a/llama/runner/requirements.go b/llama/runner/requirements.go deleted file mode 100644 index 71b3b9aa..00000000 --- a/llama/runner/requirements.go +++ /dev/null @@ -1,19 +0,0 @@ -package main - -import ( - "encoding/json" - "os" - - "github.com/ollama/ollama/llama" - "github.com/ollama/ollama/version" -) - -func printRequirements(fp *os.File) { - attrs := map[string]string{ - "system_info": llama.PrintSystemInfo(), - "version": version.Version, - "cpu_features": llama.CpuFeatures, - } - enc := json.NewEncoder(fp) - _ = enc.Encode(attrs) -} diff --git a/llama/runner/runner.go b/llama/runner/runner.go index b680f060..b74171b2 100644 --- a/llama/runner/runner.go +++ b/llama/runner/runner.go @@ -818,13 +818,8 @@ func main() { mlock := flag.Bool("mlock", false, "force system to keep model in RAM rather than swapping or compressing") tensorSplit := flag.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions") multiUserCache := flag.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users") - requirements := flag.Bool("requirements", false, "print json requirement information") flag.Parse() - if *requirements { - printRequirements(os.Stdout) - return - } level := slog.LevelInfo if *verbose { level = slog.LevelDebug diff --git a/runners/common.go b/runners/common.go index 29436468..de9fde0c 100644 --- a/runners/common.go +++ b/runners/common.go @@ -17,6 +17,7 @@ import ( "syscall" "golang.org/x/sync/errgroup" + "golang.org/x/sys/cpu" "github.com/ollama/ollama/discover" "github.com/ollama/ollama/envconfig" @@ -288,7 +289,16 @@ func GetAvailableServers(payloadsDir string) map[string]string { servers := make(map[string]string) for _, file := range files { slog.Debug("availableServers : found", "file", file) - servers[filepath.Base(filepath.Dir(file))] = filepath.Dir(file) + runnerName := filepath.Base(filepath.Dir(file)) + // Special case for our GPU runners - if compiled with standard AVX flag + // detect incompatible system + // Custom builds will omit this and its up to the user to ensure compatibility + parsed := strings.Split(runnerName, "_") + if len(parsed) == 3 && parsed[2] == "avx" && !cpu.X86.HasAVX { + slog.Info("GPU runner incompatible with host system, CPU does not have AVX", "runner", runnerName) + continue + } + servers[runnerName] = filepath.Dir(file) } return servers diff --git a/scripts/build_windows.ps1 b/scripts/build_windows.ps1 index 32ba7652..40e2569b 100644 --- a/scripts/build_windows.ps1 +++ b/scripts/build_windows.ps1 @@ -82,7 +82,7 @@ function buildOllama() { if ($null -eq ${env:OLLAMA_SKIP_GENERATE}) { write-host "Building ollama runners" Remove-Item -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}" - & make -C llama -j 12 + & make -C llama -j 12 dist if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} } else { write-host "Skipping generate step with OLLAMA_SKIP_GENERATE set"