From 751009a5d76e29fc7d2b5f4066f04671f8a911fa Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Thu, 1 Aug 2024 08:54:44 -0700 Subject: [PATCH] Runtime selection of new or old runners This adjusts the new runners to comingle with existing runners so we can use an env var to toggle the new runners on. --- envconfig/config.go | 3 + llama/Makefile | 315 ++++++++++++++++++++++++++++++++--------- llama/runner/runner.go | 68 +++++++-- llm/llm.go | 66 +++++++++ llm/server.go | 105 +++----------- 5 files changed, 399 insertions(+), 158 deletions(-) diff --git a/envconfig/config.go b/envconfig/config.go index 908636a9..a904ebfe 100644 --- a/envconfig/config.go +++ b/envconfig/config.go @@ -140,6 +140,8 @@ var ( SchedSpread = Bool("OLLAMA_SCHED_SPREAD") // IntelGPU enables experimental Intel GPU detection. IntelGPU = Bool("OLLAMA_INTEL_GPU") + // Set via OLLAMA_NEW_RUNNERS in the environment + NewRunners = Bool("OLLAMA_NEW_RUNNERS") ) func String(s string) func() string { @@ -250,6 +252,7 @@ func AsMap() map[string]EnvVar { "OLLAMA_NOHISTORY": {"OLLAMA_NOHISTORY", NoHistory(), "Do not preserve readline history"}, "OLLAMA_NOPRUNE": {"OLLAMA_NOPRUNE", NoPrune(), "Do not prune model blobs on startup"}, "OLLAMA_NUM_PARALLEL": {"OLLAMA_NUM_PARALLEL", NumParallel(), "Maximum number of parallel requests"}, + "OLLAMA_NEW_RUNNERS": {"OLLAMA_NEW_RUNNERS", NewRunners(), "Enable new experimental runners"}, "OLLAMA_ORIGINS": {"OLLAMA_ORIGINS", Origins(), "A comma separated list of allowed origins"}, "OLLAMA_RUNNERS_DIR": {"OLLAMA_RUNNERS_DIR", RunnersDir(), "Location for runners"}, "OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"}, diff --git a/llama/Makefile b/llama/Makefile index 89a1f636..b4d66b66 100644 --- a/llama/Makefile +++ b/llama/Makefile @@ -4,41 +4,102 @@ ifeq ($(ARCH),x86_64) ARCH := amd64 endif ifneq (,$(findstring MINGW,$(OS))$(findstring MSYS,$(OS))) - OS := Windows + OS := windows +else ifeq ($(OS),Linux) + OS := linux +else ifeq ($(OS),Darwin) + OS := darwin endif +comma:= , +empty:= +space:= $(empty) $(empty) export CGO_CFLAGS_ALLOW = -mfma|-mf16c export CGO_CXXFLAGS_ALLOW = -mfma|-mf16c export HIP_PLATFORM = amd SRC_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) -BUILD_DIR := $(SRC_DIR)/build/$(OS)/$(ARCH)/ +BUILD_DIR = $(SRC_DIR)build/$(OS)-$(ARCH) +DIST_BASE = $(abspath $(SRC_DIR)/../dist/$(OS)-$(ARCH)) +RUNNERS_DIST_DIR = $(DIST_BASE)/ollama_runners +RUNNERS_PAYLOAD_DIR = $(abspath $(SRC_DIR)/../llm/build/$(OS)/$(patsubst amd64,x86_64,$(ARCH))) +RUNNERS_BUILD_DIR = $(BUILD_DIR)/ollama_runners +DEFAULT_RUNNER := $(if $(and $(filter darwin,$(OS)),$(filter arm64,$(ARCH))),metal,cpu) -ifeq ($(OS),Windows) +CUDA_LIBS_SHORT := cublas cudart cublasLt +ROCM_LIBS_SHORT := hipblas rocblas + +ifeq ($(OS),windows) + SRC_DIR := $(shell cygpath -m -s "$(SRC_DIR)") OBJ_EXT := obj SHARED_EXT := dll EXE_EXT := .exe - SHARED_PREFIX := "" - CUDA_LIB := $(shell cygpath -w -s "$(CUDA_PATH)\lib\x64") - HIP_LIB := $(shell cygpath -w -s "$(HIP_PATH)\lib") - NVCC := nvcc - # If HIP_PATH has spaces, hipcc trips over them when subprocessing - HIP_PATH := $(shell cygpath -m -s "$(HIP_PATH)\") - export HIP_PATH - HIPCC := $(HIP_PATH)bin/hipcc.bin.exe -else ifeq ($(OS),Linux) + SHARED_PREFIX := + + # TODO needs work for multiple cuda versions on windows + + CUDA_BASE_DIR := $(dir $(shell cygpath -m -s "$(CUDA_PATH)\..")) + CUDA_11=$(shell ls -d $(CUDA_BASE_DIR)/v11.? 2>/dev/null) + CUDA_12=$(shell ls -d $(CUDA_BASE_DIR)/v12.? 2>/dev/null) + CUDA_11_LIB_DIR := $(CUDA_11)/bin + CUDA_12_LIB_DIR := $(CUDA_12)/bin + + + NVCC := $(shell X=$$(which nvcc 2>/dev/null) && cygpath -m -s "$$X") + ifneq ($(HIP_PATH),) + HIP_LIB_DIR := $(shell cygpath -m -s "$(HIP_PATH)\bin") + # If HIP_PATH has spaces, hipcc trips over them when subprocessing + HIP_PATH := $(shell cygpath -m -s "$(HIP_PATH)\") + export HIP_PATH + HIPCC := $(HIP_PATH)bin/hipcc.bin.exe + endif + CP := cp + CUDA_LIBS = $(wildcard $(addsuffix 64*.$(SHARED_EXT),$(addprefix $(CUDA_LIB_DIR)/$(SHARED_PREFIX),$(CUDA_LIBS_SHORT)))) +else ifeq ($(OS),linux) + CP := cp -a OBJ_EXT := o SHARED_EXT := so SHARED_PREFIX := lib - CUDA_PATH?=/usr/local/cuda HIP_PATH?=/opt/rocm - CUDA_LIB := "$(CUDA_PATH)/lib64" - HIP_LIB := "$(HIP_PATH)/lib" - NVCC := nvcc - HIPCC := hipcc + HIP_LIB_DIR := $(HIP_PATH)/lib + HIPCC := $(shell X=$$(which hipcc 2>/dev/null) && echo $$X) + CUDA_PATH?=/usr/local/cuda + CUDA_11=$(shell ls -d $(CUDA_PATH)-11 2>/dev/null) + CUDA_12=$(shell ls -d $(CUDA_PATH)-12 2>/dev/null) + CUDA_11_LIB_DIR := $(CUDA_11)/lib64 + CUDA_12_LIB_DIR := $(CUDA_12)/lib64 else OBJ_EXT := o SHARED_EXT := so + CP := cp -a +endif + +CUDA_11_LIBS = $(wildcard $(addsuffix .$(SHARED_EXT).*,$(addprefix $(CUDA_11_LIB_DIR)/$(SHARED_PREFIX),$(CUDA_LIBS_SHORT)))) +CUDA_12_LIBS = $(wildcard $(addsuffix .$(SHARED_EXT).*,$(addprefix $(CUDA_12_LIB_DIR)/$(SHARED_PREFIX),$(CUDA_LIBS_SHORT)))) +NVCC_11 = $(CUDA_11)/bin/nvcc +NVCC_12 = $(CUDA_12)/bin/nvcc + +CUDA_DEPS_DIR = $(DIST_BASE)cuda/ +ROCM_DEPS_DIR = $(DIST_BASE)rocm/ + +ifneq ($(CUDA_11),) + CUDA_11_VARIANT= _v11 + CUDA_11_LIB_DEPS = $(addprefix $(CUDA_DEPS_DIR),$(notdir $(CUDA_11_LIBS))) +endif +ifneq ($(CUDA_12),) + CUDA_12_VARIANT= _v12 + CUDA_12_LIB_DEPS = $(addprefix $(CUDA_DEPS_DIR),$(notdir $(CUDA_12_LIBS))) +endif +ifeq ($(OLLAMA_SKIP_ROCM_GENERATE),) +ifneq ($(HIPCC),) + ROCM_VERSION := $(subst $(space),.,$(wordlist 1,2,$(subst .,$(space),$(word 3,$(subst -,$(space),$(filter HIP version: %,$(shell $(HIPCC) --version))))))) + ifneq (,$(ROCM_VERSION)) + ROCM_VARIANT = _v$(ROCM_VERSION) + endif + ROCM_LIBS = $(wildcard $(addsuffix .$(SHARED_EXT),$(addprefix $(HIP_LIB_DIR)/$(SHARED_PREFIX),$(ROCM_LIBS_SHORT)))) + ROCM_LIB_DEPS = $(addprefix $(ROCM_DEPS_DIR),$(notdir $(ROCM_LIBS))) + ROCBLAS_DEP_MANIFEST = $(ROCM_DEPS_DIR)/rocblas/library/TensileManifest.txt +endif endif CUDA_SRCS := \ @@ -51,21 +112,19 @@ CUDA_SRCS := \ $(wildcard ggml-cuda/template-instances/fattn-vec*f16-f16.cu) \ ggml.c ggml-backend.c ggml-alloc.c ggml-quants.c sgemm.cpp -CUDA_OBJS := $(CUDA_SRCS:.cu=.cuda.$(OBJ_EXT)) -CUDA_OBJS := $(CUDA_OBJS:.c=.cuda.$(OBJ_EXT)) -CUDA_OBJS := $(addprefix $(BUILD_DIR),$(CUDA_OBJS:.cpp=.cuda.$(OBJ_EXT))) +CUDA_11_OBJS := $(CUDA_SRCS:.cu=.cuda.$(OBJ_EXT)) +CUDA_11_OBJS := $(CUDA_11_OBJS:.c=.cuda.$(OBJ_EXT)) +CUDA_11_OBJS := $(addprefix $(BUILD_DIR)/cuda_v11/,$(CUDA_11_OBJS:.cpp=.cuda.$(OBJ_EXT))) +CUDA_12_OBJS := $(CUDA_SRCS:.cu=.cuda.$(OBJ_EXT)) +CUDA_12_OBJS := $(CUDA_12_OBJS:.c=.cuda.$(OBJ_EXT)) +CUDA_12_OBJS := $(addprefix $(BUILD_DIR)/cuda_v12/,$(CUDA_12_OBJS:.cpp=.cuda.$(OBJ_EXT))) HIP_OBJS := $(CUDA_SRCS:.cu=.hip.$(OBJ_EXT)) HIP_OBJS := $(HIP_OBJS:.c=.hip.$(OBJ_EXT)) -HIP_OBJS := $(addprefix $(BUILD_DIR),$(HIP_OBJS:.cpp=.hip.$(OBJ_EXT))) +HIP_OBJS := $(addprefix $(BUILD_DIR)/,$(HIP_OBJS:.cpp=.hip.$(OBJ_EXT))) CUDA_FLAGS := \ - --generate-code=arch=compute_50,code=[compute_50,sm_50] \ - --generate-code=arch=compute_52,code=[compute_52,sm_52] \ - --generate-code=arch=compute_61,code=[compute_61,sm_61] \ - --generate-code=arch=compute_70,code=[compute_70,sm_70] \ - --generate-code=arch=compute_75,code=[compute_75,sm_75] \ - --generate-code=arch=compute_80,code=[compute_80,sm_80] \ + -t4 \ -DGGML_CUDA_DMMV_X=32 \ -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 \ -DGGML_USE_CUDA=1 \ @@ -82,6 +141,34 @@ CUDA_FLAGS := \ -I. \ -O3 +CUDA_11_FLAGS := \ + --generate-code=arch=compute_50,code=[compute_50,sm_50] \ + --generate-code=arch=compute_52,code=[compute_52,sm_52] \ + --generate-code=arch=compute_53,code=[compute_53,sm_53] \ + --generate-code=arch=compute_60,code=[compute_60,sm_60] \ + --generate-code=arch=compute_61,code=[compute_61,sm_61] \ + --generate-code=arch=compute_62,code=[compute_62,sm_62] \ + --generate-code=arch=compute_70,code=[compute_70,sm_70] \ + --generate-code=arch=compute_72,code=[compute_72,sm_72] \ + --generate-code=arch=compute_75,code=[compute_75,sm_75] \ + --generate-code=arch=compute_80,code=[compute_80,sm_80] \ + --generate-code=arch=compute_86,code=[compute_86,sm_86] + +CUDA_12_FLAGS := \ + --generate-code=arch=compute_60,code=[compute_60,sm_60] \ + --generate-code=arch=compute_61,code=[compute_61,sm_61] \ + --generate-code=arch=compute_62,code=[compute_62,sm_62] \ + --generate-code=arch=compute_70,code=[compute_70,sm_70] \ + --generate-code=arch=compute_72,code=[compute_72,sm_72] \ + --generate-code=arch=compute_75,code=[compute_75,sm_75] \ + --generate-code=arch=compute_80,code=[compute_80,sm_80] \ + --generate-code=arch=compute_86,code=[compute_86,sm_86] \ + --generate-code=arch=compute_87,code=[compute_87,sm_87] \ + --generate-code=arch=compute_89,code=[compute_89,sm_89] \ + --generate-code=arch=compute_90,code=[compute_90,sm_90] \ + --generate-code=arch=compute_90a,code=[compute_90a,sm_90a] \ + -DGGML_CUDA_USE_GRAPHS=on + HIP_ARCHS := gfx900 gfx940 gfx941 gfx942 gfx1010 gfx1012 gfx1030 gfx1100 gfx1101 gfx1102 LINUX_HIP_ARCHS := gfx906:xnack- gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- @@ -111,79 +198,181 @@ HIP_FLAGS := \ -I. \ $(foreach arch, $(HIP_ARCHS), --offload-arch=$(arch)) -ifeq ($(OS), Linux) - HIP_FLAGS += $(foreach arch, $(LINUX_HIP_ARCHS), --offload-arch=$(arch)) -fPIC - CUDA_FLAGS += -fPIC - CFLAGS += -Xcompiler -fPIC -D_GNU_SOURCE - CXXFLAGS += -Xcompiler -fPIC -D_GNU_SOURCE -else ifeq ($(OS),Windows) +ifeq ($(OS),linux) + HIP_FLAGS += $(foreach arch, $(LINUX_HIP_ARCHS), --offload-arch=$(arch)) -fPIC -Wno-unused-function + CUDA_FLAGS += -fPIC -Wno-unused-function + NVCC_CFLAGS = $(CFLAGS) -Xcompiler -fPIC -D_GNU_SOURCE + NVCC_CXXFLAGS = $(CXXFLAGS) -Xcompiler -fPIC -D_GNU_SOURCE + HIPCC_CFLAGS = $(CFLAGS) -fPIC -D_GNU_SOURCE + HIPCC_CXXFLAGS = $(CXXFLAGS) -fPIC -D_GNU_SOURCE +else ifeq ($(OS),windows) HIP_FLAGS += -Xclang --dependent-lib=msvcrt + CFLAGS += -D_WIN32_WINNT=0x602 + CXXFLAGS += -D_WIN32_WINNT=0x602 + NVCC_CFLAGS = $(CFLAGS) + NVCC_CXXFLAGS = $(CXXFLAGS) + HIPCC_CFLAGS = $(CFLAGS) + HIPCC_CXXFLAGS = $(CXXFLAGS) endif -RUNNERS := $(BUILD_DIR)ollama_runner$(EXE_EXT) +ifeq ($(OLLAMA_SKIP_CPU_GENERATE),) +RUNNERS := $(DEFAULT_RUNNER) ifeq ($(ARCH),amd64) - RUNNERS += $(BUILD_DIR)ollama_runner_avx$(EXE_EXT) $(BUILD_DIR)ollama_runner_avx2$(EXE_EXT) + RUNNERS += cpu_avx cpu_avx2 endif -ifneq ($(NVCC),) - RUNNERS += $(BUILD_DIR)ollama_runner_cuda$(EXE_EXT) endif +ifeq ($(OLLAMA_SKIP_CUDA_GENERATE),) +ifneq ($(CUDA_11),) + RUNNERS += cuda_v11 +endif +ifneq ($(CUDA_12),) + RUNNERS += cuda_v12 +endif +endif +ifeq ($(OLLAMA_SKIP_ROCM_GENERATE),) ifneq ($(HIPCC),) - RUNNERS += $(BUILD_DIR)ollama_runner_rocm$(EXE_EXT) + RUNNERS += rocm$(ROCM_VARIANT) +endif endif -runners: $(RUNNERS) +DIST_RUNNERS = $(addprefix $(RUNNERS_DIST_DIR)/,$(addsuffix /ollama_runner$(EXE_EXT),$(RUNNERS))) +PAYLOAD_RUNNERS = $(addprefix $(RUNNERS_PAYLOAD_DIR)/,$(addsuffix /ollama_runner$(EXE_EXT).gz,$(addsuffix /bin,$(RUNNERS)))) +BUILD_RUNNERS = $(addprefix $(RUNNERS_BUILD_DIR)/,$(addsuffix /ollama_runner$(EXE_EXT),$(RUNNERS))) -$(BUILD_DIR)%.cuda.$(OBJ_EXT): %.cu +all: dist payload + +dist: $(DIST_RUNNERS) $(ROCBLAS_DEP_MANIFEST) + +ifeq ($(OS),windows) +# Unused on windows as we don't cary the payloads in the go binary +payload: +else +payload: $(PAYLOAD_RUNNERS) +endif + +runners: $(BUILD_RUNNERS) + +$(BUILD_DIR)/cuda_v11/%.cuda.$(OBJ_EXT): %.cu @-mkdir -p $(dir $@) - $(NVCC) -c $(CUDA_FLAGS) -o $@ $< + $(NVCC_11) -c $(CUDA_FLAGS) $(CUDA_11_FLAGS) -o $@ $< -$(BUILD_DIR)%.cuda.$(OBJ_EXT): %.c +$(BUILD_DIR)/cuda_v11/%.cuda.$(OBJ_EXT): %.c @-mkdir -p $(dir $@) - $(NVCC) -c $(CFLAGS) -o $@ $< + $(NVCC_11) -c $(NVCC_CFLAGS) -o $@ $< -$(BUILD_DIR)%.cuda.$(OBJ_EXT): %.cpp +$(BUILD_DIR)/cuda_v11/%.cuda.$(OBJ_EXT): %.cpp @-mkdir -p $(dir $@) - $(NVCC) -c $(CXXFLAGS) -o $@ $< + $(NVCC_11) -c $(NVCC_CXXFLAGS) -o $@ $< -$(BUILD_DIR)$(SHARED_PREFIX)ggml_cuda.$(SHARED_EXT): $(CUDA_OBJS) +$(BUILD_DIR)/cuda_v12/%.cuda.$(OBJ_EXT): %.cu @-mkdir -p $(dir $@) - $(NVCC) --shared -lcuda -lcublas -lcudart -lcublasLt $(CUDA_FLAGS) $(CUDA_OBJS) -o $@ + $(NVCC_12) -c $(CUDA_FLAGS) $(CUDA_12_FLAGS) -o $@ $< -$(BUILD_DIR)%.hip.$(OBJ_EXT): %.cu +$(BUILD_DIR)/cuda_v12/%.cuda.$(OBJ_EXT): %.c + @-mkdir -p $(dir $@) + $(NVCC_12) -c $(NVCC_CFLAGS) -o $@ $< + +$(BUILD_DIR)/cuda_v12/%.cuda.$(OBJ_EXT): %.cpp + @-mkdir -p $(dir $@) + $(NVCC_12) -c $(NVCC_CXXFLAGS) -o $@ $< + +$(RUNNERS_DIST_DIR)/%: $(RUNNERS_BUILD_DIR)/% + @-mkdir -p $(dir $@) + cp $< $@ + +$(RUNNERS_DIST_DIR)/cuda_v11/ollama_runner$(EXE_EXT): $(RUNNERS_DIST_DIR)/cuda_v11/$(SHARED_PREFIX)ggml_cuda.$(SHARED_EXT) +$(RUNNERS_PAYLOAD_DIR)/cuda_v11/bin/ollama_runner$(EXE_EXT).gz: $(RUNNERS_PAYLOAD_DIR)/cuda_v11/bin/$(SHARED_PREFIX)ggml_cuda.$(SHARED_EXT).gz +$(RUNNERS_DIST_DIR)/cuda_v12/ollama_runner$(EXE_EXT): $(RUNNERS_DIST_DIR)/cuda_v12/$(SHARED_PREFIX)ggml_cuda.$(SHARED_EXT) +$(RUNNERS_PAYLOAD_DIR)/cuda_v12/bin/ollama_runner$(EXE_EXT).gz: $(RUNNERS_PAYLOAD_DIR)/cuda_v12/bin/$(SHARED_PREFIX)ggml_cuda.$(SHARED_EXT).gz + +$(RUNNERS_BUILD_DIR)/cuda_v11/$(SHARED_PREFIX)ggml_cuda.$(SHARED_EXT): $(CUDA_11_OBJS) $(CUDA_11_LIB_DEPS) + @-mkdir -p $(dir $@) + $(NVCC_11) --shared -lcuda -L${CUDA_DEPS_DIR} $(foreach lib, $(CUDA_LIBS_SHORT), -l$(lib)) $(CUDA_FLAGS) $(CUDA_11_FLAGS) $(CUDA_11_OBJS) -o $@ + +$(RUNNERS_BUILD_DIR)/cuda_v12/$(SHARED_PREFIX)ggml_cuda.$(SHARED_EXT): $(CUDA_12_OBJS) $(CUDA_12_LIB_DEPS) + @-mkdir -p $(dir $@) + $(NVCC_12) --shared -lcuda -L${CUDA_DEPS_DIR} $(foreach lib, $(CUDA_LIBS_SHORT), -l$(lib)) $(CUDA_FLAGS) $(CUDA_12_FLAGS) $(CUDA_12_OBJS) -o $@ + +$(CUDA_11_LIB_DEPS): + @-mkdir -p $(dir $@) + $(CP) $(CUDA_11_LIB_DIR)/$(notdir $@)* $(dir $@) + +$(CUDA_12_LIB_DEPS): + @-mkdir -p $(dir $@) + $(CP) $(CUDA_12_LIB_DIR)/$(notdir $@)* $(dir $@) + +$(BUILD_DIR)/%.hip.$(OBJ_EXT): %.cu @-mkdir -p $(dir $@) $(HIPCC) -c $(HIP_FLAGS) -o $@ $< -$(BUILD_DIR)%.hip.$(OBJ_EXT): %.c +$(BUILD_DIR)/%.hip.$(OBJ_EXT): %.c @-mkdir -p $(dir $@) - $(HIPCC) -c $(CFLAGS) -o $@ $< + $(HIPCC) -c $(HIPCC_CFLAGS) -o $@ $< -$(BUILD_DIR)%.hip.$(OBJ_EXT): %.cpp +$(BUILD_DIR)/%.hip.$(OBJ_EXT): %.cpp @-mkdir -p $(dir $@) - $(HIPCC) -c $(CXXFLAGS) -o $@ $< + $(HIPCC) -c $(HIPCC_CXXFLAGS) -o $@ $< -$(BUILD_DIR)$(SHARED_PREFIX)ggml_hipblas.$(SHARED_EXT): $(HIP_OBJS) +$(RUNNERS_DIST_DIR)/rocm$(ROCM_VARIANT)/ollama_runner$(EXE_EXT): $(RUNNERS_DIST_DIR)/rocm$(ROCM_VARIANT)/$(SHARED_PREFIX)ggml_hipblas.$(SHARED_EXT) +$(RUNNERS_PAYLOAD_DIR)/rocm$(ROCM_VARIANT)/bin/ollama_runner$(EXE_EXT).gz: $(RUNNERS_PAYLOAD_DIR)/rocm$(ROCM_VARIANT)/bin/$(SHARED_PREFIX)ggml_hipblas.$(SHARED_EXT).gz + +$(RUNNERS_BUILD_DIR)/rocm$(ROCM_VARIANT)/$(SHARED_PREFIX)ggml_hipblas.$(SHARED_EXT): $(HIP_OBJS) $(ROCM_LIB_DEPS) @-mkdir -p $(dir $@) - $(HIPCC) --shared -lhipblas -lamdhip64 -lrocblas $(HIP_OBJS) -o $@ + $(HIPCC) --shared -lamdhip64 -L${ROCM_DEPS_DIR} $(foreach lib, $(ROCM_LIBS_SHORT), -l$(lib)) $(HIP_OBJS) -o $@ -$(BUILD_DIR)ollama_runner$(EXE_EXT): +$(ROCM_LIB_DEPS): + @-mkdir -p $(dir $@) + $(CP) $(HIP_LIB_DIR)/$(notdir $@)* $(dir $@) + +$(RUNNERS_BUILD_DIR)/$(DEFAULT_RUNNER)/ollama_runner$(EXE_EXT): *.go ./runner/*.go + @-mkdir -p $(dir $@) CGO_ENABLED=1 GOARCH=$(ARCH) go build -ldflags "-s -w" -o $@ ./runner -$(BUILD_DIR)ollama_runner_avx$(EXE_EXT): +$(RUNNERS_BUILD_DIR)/cpu_avx/ollama_runner$(EXE_EXT): *.go ./runner/*.go + @-mkdir -p $(dir $@) CGO_ENABLED=1 GOARCH=$(ARCH) go build -ldflags "-s -w" -tags avx -o $@ ./runner -$(BUILD_DIR)ollama_runner_avx2$(EXE_EXT): +$(RUNNERS_BUILD_DIR)/cpu_avx2/ollama_runner$(EXE_EXT): *.go ./runner/*.go + @-mkdir -p $(dir $@) CGO_ENABLED=1 GOARCH=$(ARCH) go build -ldflags "-s -w" -tags avx,avx2 -o $@ ./runner -$(BUILD_DIR)ollama_runner_cuda$(EXE_EXT): $(BUILD_DIR)$(SHARED_PREFIX)ggml_cuda.$(SHARED_EXT) - CGO_ENABLED=1 GOARCH=$(ARCH) CGO_LDFLAGS=-L"$(CUDA_LIB)" go build -ldflags "-s -w" -tags avx,cuda -o $@ ./runner +$(RUNNERS_BUILD_DIR)/cuda_v11/ollama_runner$(EXE_EXT): $(RUNNERS_BUILD_DIR)/cuda_v11/$(SHARED_PREFIX)ggml_cuda.$(SHARED_EXT) *.go ./runner/*.go + @-mkdir -p $(dir $@) + CGO_ENABLED=1 GOARCH=$(ARCH) CGO_LDFLAGS=-L"$(RUNNERS_BUILD_DIR)/cuda_v11/" go build -ldflags "-s -w" -tags avx,cuda -o $@ ./runner -$(BUILD_DIR)ollama_runner_rocm$(EXE_EXT): $(BUILD_DIR)$(SHARED_PREFIX)ggml_hipblas.$(SHARED_EXT) - CGO_ENABLED=1 GOARCH=$(ARCH) CGO_LDFLAGS=-L"$(HIP_LIB)" go build -ldflags "-s -w" -tags avx,rocm -o $@ ./runner +$(RUNNERS_BUILD_DIR)/cuda_v12/ollama_runner$(EXE_EXT): $(RUNNERS_BUILD_DIR)/cuda_v12/$(SHARED_PREFIX)ggml_cuda.$(SHARED_EXT) *.go ./runner/*.go + @-mkdir -p $(dir $@) + CGO_ENABLED=1 GOARCH=$(ARCH) CGO_LDFLAGS=-L"$(RUNNERS_BUILD_DIR)/cuda_v12/" go build -ldflags "-s -w" -tags avx,cuda -o $@ ./runner + +$(RUNNERS_BUILD_DIR)/rocm$(ROCM_VARIANT)/ollama_runner$(EXE_EXT): $(RUNNERS_BUILD_DIR)/rocm$(ROCM_VARIANT)/$(SHARED_PREFIX)ggml_hipblas.$(SHARED_EXT) *.go ./runner/*.go + @-mkdir -p $(dir $@) + CGO_ENABLED=1 GOARCH=$(ARCH) CGO_LDFLAGS=-L"$(RUNNERS_BUILD_DIR)/rocm$(ROCM_VARIANT)/" go build -ldflags "-s -w" -tags avx,rocm -o $@ ./runner + + +$(ROCBLAS_DEP_MANIFEST): + @-mkdir -p $(dir $@) + @echo "Copying rocblas library..." + cd $(HIP_LIB_DIR)/rocblas/library/ && tar cf - . | (cd $(dir $@) && tar xf - ) + @echo "rocblas library copy complete" + + +$(RUNNERS_PAYLOAD_DIR)/%/bin/ollama_runner.gz: $(RUNNERS_BUILD_DIR)/%/ollama_runner + @-mkdir -p $(dir $@) + gzip --best -c $< > $@ +$(RUNNERS_PAYLOAD_DIR)/cuda_v11/bin/%.gz: $(RUNNERS_BUILD_DIR)/cuda_v11/% + @-mkdir -p $(dir $@) + gzip --best -c $< > $@ +$(RUNNERS_PAYLOAD_DIR)/cuda_v12/bin/%.gz: $(RUNNERS_BUILD_DIR)/cuda_v12/% + @-mkdir -p $(dir $@) + gzip --best -c $< > $@ +$(RUNNERS_PAYLOAD_DIR)/rocm$(ROCM_VARIANT)/bin/%.gz: $(RUNNERS_BUILD_DIR)/rocm$(ROCM_VARIANT)/% + @-mkdir -p $(dir $@) + gzip --best -c $< > $@ clean: - rm -rf $(BUILD_DIR) + rm -rf $(BUILD_DIR) $(DIST_RUNNERS) $(PAYLOAD_RUNNERS) -.PHONY: runners clean ollama_runner$(EXE_EXT) ollama_runner_avx$(EXE_EXT) ollama_runner_avx2$(EXE_EXT) ollama_runner_cuda$(EXE_EXT) ollama_runner_rocm$(EXE_EXT) +.PHONY: all dist payload runners clean $(RUNNERS) # Handy debugging for make variables print-%: diff --git a/llama/runner/runner.go b/llama/runner/runner.go index 2abe2f9e..66cd23a2 100644 --- a/llama/runner/runner.go +++ b/llama/runner/runner.go @@ -10,6 +10,8 @@ import ( "math" "net" "net/http" + "os" + "path/filepath" "runtime" "strconv" "strings" @@ -146,7 +148,7 @@ func (s *Server) run(ctx context.Context) { case <-ctx.Done(): return default: - slog.Info("Processing batch", "seqs", len(s.seqs)) + slog.Debug("Processing batch", "seqs", len(s.seqs)) s.mu.Lock() for s.allNil() { s.cond.Wait() // Wait until an item is added @@ -186,6 +188,7 @@ func (s *Server) run(ctx context.Context) { err := s.lc.Decode(batch) if err != nil { + slog.Error("failed to decode batch", "error", err) panic("Failed to decode") } @@ -227,7 +230,7 @@ func (s *Server) run(ctx context.Context) { seq.numPredicted++ - slog.Info("sampled", "piece", piece) + slog.Debug("sampled", "piece", piece) // if it's an end of sequence token, break // TODO: just end this sequence @@ -463,7 +466,6 @@ type HealthResponse struct { // TODO (jmorganca): is it safe to do this concurrently with decoding? func (s *Server) health(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "application/json") - if err := json.NewEncoder(w).Encode(&HealthResponse{ Status: s.status, Progress: s.progress, @@ -475,17 +477,63 @@ func (s *Server) health(w http.ResponseWriter, r *http.Request) { func main() { mpath := flag.String("model", "", "Path to model binary file") - ppath := flag.String("projector", "", "Path to projector binary file") + ppath := flag.String("mmproj", "", "Path to projector binary file") parallel := flag.Int("parallel", 1, "Number of sequences to handle simultaneously") batchSize := flag.Int("batch-size", 512, "Batch size") - nGpuLayers := flag.Int("num-gpu", 0, "Number of layers to offload to GPU") + nGpuLayers := flag.Int("n-gpu-layers", 0, "Number of layers to offload to GPU") mainGpu := flag.Int("main-gpu", 0, "Main GPU") - flashAttention := flag.Bool("flash-attention", false, "Enable flash attention") - numCtx := flag.Int("num-ctx", 2048, "Context (or KV cache) size") + flashAttention := flag.Bool("flash-attn", false, "Enable flash attention") + numCtx := flag.Int("ctx-size", 2048, "Context (or KV cache) size") lpath := flag.String("lora", "", "Path to lora layer file") port := flag.Int("port", 8080, "Port to expose the server on") threads := flag.Int("threads", runtime.NumCPU(), "Number of threads to use during generation") + + // TODO not yet implemented but wired to keep the parsing aligned + embedding := flag.Bool("embedding", false, "enable embedding vector output (default: disabled)") + logDisable := flag.Bool("log-disable", false, "disables logging to a file") + verbose := flag.Bool("verbose", false, "verbose output (default: disabled)") + f32 := flag.Bool("memory-f32", false, "use f32 instead of f16 for memory key+value (default: disabled) not recommended: doubles context memory required and no measurable increase in quality") + noMmap := flag.Bool("no-mmap", false, "do not memory-map model (slower load but may reduce pageouts if not using mlock)") + mlock := flag.Bool("mlock", false, "force system to keep model in RAM rather than swapping or compressing") + tensorSplit := flag.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions") + flag.Parse() + level := slog.LevelInfo + if *verbose { + level = slog.LevelDebug + } + handler := slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{ + Level: level, + AddSource: true, + ReplaceAttr: func(_ []string, attr slog.Attr) slog.Attr { + if attr.Key == slog.SourceKey { + source := attr.Value.Any().(*slog.Source) + source.File = filepath.Base(source.File) + } + return attr + }, + }) + slog.SetDefault(slog.New(handler)) + + // TODO actually implement... + if *embedding { + slog.Warn("embeddings not yet support") + } + if *logDisable { + slog.Info("ignoring --log-disable") + } + if *f32 { + slog.Warn("memory-f32 not yet supported") + } + if *noMmap { + slog.Warn("no-mmap not yet supported") + } + if *mlock { + slog.Warn("mlock not yet supported") + } + if *tensorSplit != "" { + slog.Warn("tensor-split not yet implemented") + } server := &Server{ numCtx: *numCtx, @@ -498,7 +546,7 @@ func main() { // load the model llama.BackendInit() params := llama.NewModelParams(*nGpuLayers, *mainGpu, func(progress float32) { - slog.Info("Loading model", "progress %", math.Round(float64(progress*100))) + slog.Debug("Loading model", "progress %", math.Round(float64(progress*100))) server.progress = progress }) server.model = llama.LoadModelFromFile(*mpath, params) @@ -531,7 +579,7 @@ func main() { defer listener.Close() mux := http.NewServeMux() - mux.HandleFunc("/embeddings", server.embeddings) + mux.HandleFunc("/embedding", server.embeddings) mux.HandleFunc("/completion", server.completion) mux.HandleFunc("/health", server.health) @@ -539,7 +587,7 @@ func main() { Handler: mux, } - server.status = "ready" + server.status = "ok" log.Println("Server listening on", addr) if err := httpServer.Serve(listener); err != nil { diff --git a/llm/llm.go b/llm/llm.go index 87cea5cd..29e0b62a 100644 --- a/llm/llm.go +++ b/llm/llm.go @@ -12,6 +12,7 @@ package llm import "C" import ( "fmt" + "log/slog" "unsafe" ) @@ -37,3 +38,68 @@ func Quantize(infile, outfile string, ftype fileType) error { return nil } + +type loadedModel struct { + model *C.struct_llama_model +} + +func loadModel(modelfile string, vocabOnly bool) (*loadedModel, error) { + // TODO figure out how to quiet down the logging so we don't have 2 copies of the model metadata showing up + slog.Info("XXX initializing default model params") + params := C.llama_model_default_params() + params.vocab_only = C.bool(vocabOnly) + + cmodelfile := C.CString(modelfile) + defer C.free(unsafe.Pointer(cmodelfile)) + + slog.Info("XXX loading model", "model", modelfile) + model := C.llama_load_model_from_file(cmodelfile, params) + if model == nil { + return nil, fmt.Errorf("failed to load model %s", modelfile) + } + return &loadedModel{model}, nil +} + +func freeModel(model *loadedModel) { + C.llama_free_model(model.model) +} + +func tokenize(model *loadedModel, content string) ([]int, error) { + ccontent := C.CString(content) + defer C.free(unsafe.Pointer(ccontent)) + + len := len(content) + 2 + tokens := make([]C.int32_t, len) + + tokenCount := C.llama_tokenize(model.model, ccontent, C.int32_t(len), &tokens[0], C.int32_t(len), true, true) + if tokenCount < 0 { + slog.Info("XXX got negative response", "count", tokenCount) + tokens = make([]C.int32_t, int(tokenCount)) + tokenCount = C.llama_tokenize(model.model, ccontent, C.int32_t(len), &tokens[0], tokenCount, true, true) + } else if tokenCount == 0 { + return nil, nil + } + ret := make([]int, tokenCount) + for i := range int(tokenCount) { + ret[i] = int(tokens[i]) + } + slog.Debug("XXX tokenized", "tokens", tokens, "content", content) + return ret, nil +} + +func detokenize(model *loadedModel, tokens []int) string { + slog.Info("XXX in CGO detokenize") + var resp string + for _, token := range tokens { + buf := make([]C.char, 8) + nTokens := C.llama_token_to_piece(model.model, C.int(token), &buf[0], 8, 0, true) + if nTokens < 0 { + buf = make([]C.char, -nTokens) + nTokens = C.llama_token_to_piece(model.model, C.int(token), &buf[0], -nTokens, 0, true) + } + tokString := C.GoStringN(&buf[0], nTokens) + resp += tokString + } + slog.Debug("XXX detokenized", "tokens", tokens, "content", resp) + return resp +} diff --git a/llm/server.go b/llm/server.go index 9c08f1bb..aa77d6fe 100644 --- a/llm/server.go +++ b/llm/server.go @@ -50,6 +50,7 @@ type llmServer struct { status *StatusWriter options api.Options numParallel int + model *loadedModel estimate MemoryEstimate totalLayers uint64 @@ -322,6 +323,9 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr } server := filepath.Join(dir, "ollama_llama_server") + if envconfig.NewRunners() { + server = filepath.Join(dir, "ollama_runner") + } if runtime.GOOS == "windows" { server += ".exe" } @@ -329,6 +333,9 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr // Detect tmp cleaners wiping out the file _, err := os.Stat(server) if errors.Is(err, os.ErrNotExist) { + if envconfig.NewRunners() { + return nil, fmt.Errorf("experimental runners enabled, but not present in this build") + } slog.Warn("llama server disappeared, reinitializing payloads", "path", server, "error", err) err = Init() if err != nil { @@ -337,11 +344,16 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr } } + m, err := loadModel(model, true) + if err != nil { + return nil, fmt.Errorf("unable to load model for tokenization %w", err) + } s := &llmServer{ port: port, cmd: exec.Command(server, finalParams...), status: NewStatusWriter(os.Stderr), options: opts, + model: m, estimate: estimate, numParallel: numParallel, sem: semaphore.NewWeighted(int64(numParallel)), @@ -816,16 +828,16 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu continue } + // slog.Debug("got line", "line", string(line)) evt, ok := bytes.CutPrefix(line, []byte("data: ")) if !ok { - return fmt.Errorf("error parsing llm response stream: %s", line) + evt = line } var c completion if err := json.Unmarshal(evt, &c); err != nil { return fmt.Errorf("error unmarshalling llm prediction response: %v", err) } - switch { case strings.TrimSpace(c.Content) == lastToken: tokenRepeat++ @@ -927,7 +939,7 @@ func (s *llmServer) Embedding(ctx context.Context, input string) ([]float32, err } if resp.StatusCode >= 400 { - log.Printf("llm encode error: %s", body) + log.Printf("llm embedding error: %s", body) return nil, fmt.Errorf("%s", body) } @@ -948,47 +960,7 @@ type TokenizeResponse struct { } func (s *llmServer) Tokenize(ctx context.Context, content string) ([]int, error) { - // Make sure the server is ready - status, err := s.getServerStatus(ctx) - if err != nil { - return nil, err - } else if status != ServerStatusReady && status != ServerStatusNoSlotsAvailable { - return nil, fmt.Errorf("unexpected server status: %s", status.ToString()) - } - - data, err := json.Marshal(TokenizeRequest{Content: content}) - if err != nil { - return nil, fmt.Errorf("marshaling encode data: %w", err) - } - - req, err := http.NewRequestWithContext(ctx, http.MethodPost, fmt.Sprintf("http://127.0.0.1:%d/tokenize", s.port), bytes.NewBuffer(data)) - if err != nil { - return nil, fmt.Errorf("encode request: %w", err) - } - req.Header.Set("Content-Type", "application/json") - - resp, err := http.DefaultClient.Do(req) - if err != nil { - return nil, fmt.Errorf("do encode request: %w", err) - } - defer resp.Body.Close() - - body, err := io.ReadAll(resp.Body) - if err != nil { - return nil, fmt.Errorf("read encode request: %w", err) - } - - if resp.StatusCode >= 400 { - log.Printf("llm encode error: %s", body) - return nil, fmt.Errorf("%s", body) - } - - var encoded TokenizeResponse - if err := json.Unmarshal(body, &encoded); err != nil { - return nil, fmt.Errorf("unmarshal encode response: %w", err) - } - - return encoded.Tokens, nil + return tokenize(s.model, content) } type DetokenizeRequest struct { @@ -1000,50 +972,13 @@ type DetokenizeResponse struct { } func (s *llmServer) Detokenize(ctx context.Context, tokens []int) (string, error) { - // Make sure the server is ready - status, err := s.getServerStatus(ctx) - if err != nil { - return "", err - } else if status != ServerStatusReady && status != ServerStatusNoSlotsAvailable { - return "", fmt.Errorf("unexpected server status: %s", status.ToString()) - } - - data, err := json.Marshal(DetokenizeRequest{Tokens: tokens}) - if err != nil { - return "", fmt.Errorf("marshaling decode data: %w", err) - } - - req, err := http.NewRequestWithContext(ctx, http.MethodPost, fmt.Sprintf("http://127.0.0.1:%d/detokenize", s.port), bytes.NewBuffer(data)) - if err != nil { - return "", fmt.Errorf("decode request: %w", err) - } - req.Header.Set("Content-Type", "application/json") - - resp, err := http.DefaultClient.Do(req) - if err != nil { - return "", fmt.Errorf("do decode request: %w", err) - } - defer resp.Body.Close() - - body, err := io.ReadAll(resp.Body) - if err != nil { - return "", fmt.Errorf("read decode request: %w", err) - } - - if resp.StatusCode >= 400 { - log.Printf("llm decode error: %s", body) - return "", fmt.Errorf("%s", body) - } - - var decoded DetokenizeResponse - if err := json.Unmarshal(body, &decoded); err != nil { - return "", fmt.Errorf("unmarshal encode response: %w", err) - } - - return decoded.Content, nil + return detokenize(s.model, tokens), nil } func (s *llmServer) Close() error { + if s.model != nil { + freeModel(s.model) + } if s.cmd != nil { slog.Debug("stopping llama server") if err := s.cmd.Process.Kill(); err != nil {