Runtime selection of new or old runners

This adjusts the new runners to commingle with existing runners so we can use an
env var to toggle the new runners on.
This commit is contained in:
Daniel Hiltgen 2024-08-01 08:54:44 -07:00 committed by jmorganca
parent 8527028bf4
commit 751009a5d7
5 changed files with 399 additions and 158 deletions

View File

@ -140,6 +140,8 @@ var (
SchedSpread = Bool("OLLAMA_SCHED_SPREAD")
// IntelGPU enables experimental Intel GPU detection.
IntelGPU = Bool("OLLAMA_INTEL_GPU")
// Set via OLLAMA_NEW_RUNNERS in the environment
NewRunners = Bool("OLLAMA_NEW_RUNNERS")
)
func String(s string) func() string {
@ -250,6 +252,7 @@ func AsMap() map[string]EnvVar {
"OLLAMA_NOHISTORY": {"OLLAMA_NOHISTORY", NoHistory(), "Do not preserve readline history"},
"OLLAMA_NOPRUNE": {"OLLAMA_NOPRUNE", NoPrune(), "Do not prune model blobs on startup"},
"OLLAMA_NUM_PARALLEL": {"OLLAMA_NUM_PARALLEL", NumParallel(), "Maximum number of parallel requests"},
"OLLAMA_NEW_RUNNERS": {"OLLAMA_NEW_RUNNERS", NewRunners(), "Enable new experimental runners"},
"OLLAMA_ORIGINS": {"OLLAMA_ORIGINS", Origins(), "A comma separated list of allowed origins"},
"OLLAMA_RUNNERS_DIR": {"OLLAMA_RUNNERS_DIR", RunnersDir(), "Location for runners"},
"OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"},

View File

@ -4,41 +4,102 @@ ifeq ($(ARCH),x86_64)
ARCH := amd64
endif
ifneq (,$(findstring MINGW,$(OS))$(findstring MSYS,$(OS)))
OS := Windows
OS := windows
else ifeq ($(OS),Linux)
OS := linux
else ifeq ($(OS),Darwin)
OS := darwin
endif
comma:= ,
empty:=
space:= $(empty) $(empty)
export CGO_CFLAGS_ALLOW = -mfma|-mf16c
export CGO_CXXFLAGS_ALLOW = -mfma|-mf16c
export HIP_PLATFORM = amd
SRC_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
BUILD_DIR := $(SRC_DIR)/build/$(OS)/$(ARCH)/
BUILD_DIR = $(SRC_DIR)build/$(OS)-$(ARCH)
DIST_BASE = $(abspath $(SRC_DIR)/../dist/$(OS)-$(ARCH))
RUNNERS_DIST_DIR = $(DIST_BASE)/ollama_runners
RUNNERS_PAYLOAD_DIR = $(abspath $(SRC_DIR)/../llm/build/$(OS)/$(patsubst amd64,x86_64,$(ARCH)))
RUNNERS_BUILD_DIR = $(BUILD_DIR)/ollama_runners
DEFAULT_RUNNER := $(if $(and $(filter darwin,$(OS)),$(filter arm64,$(ARCH))),metal,cpu)
ifeq ($(OS),Windows)
CUDA_LIBS_SHORT := cublas cudart cublasLt
ROCM_LIBS_SHORT := hipblas rocblas
ifeq ($(OS),windows)
SRC_DIR := $(shell cygpath -m -s "$(SRC_DIR)")
OBJ_EXT := obj
SHARED_EXT := dll
EXE_EXT := .exe
SHARED_PREFIX := ""
CUDA_LIB := $(shell cygpath -w -s "$(CUDA_PATH)\lib\x64")
HIP_LIB := $(shell cygpath -w -s "$(HIP_PATH)\lib")
NVCC := nvcc
# If HIP_PATH has spaces, hipcc trips over them when subprocessing
HIP_PATH := $(shell cygpath -m -s "$(HIP_PATH)\")
export HIP_PATH
HIPCC := $(HIP_PATH)bin/hipcc.bin.exe
else ifeq ($(OS),Linux)
SHARED_PREFIX :=
# TODO needs work for multiple cuda versions on windows
CUDA_BASE_DIR := $(dir $(shell cygpath -m -s "$(CUDA_PATH)\.."))
CUDA_11=$(shell ls -d $(CUDA_BASE_DIR)/v11.? 2>/dev/null)
CUDA_12=$(shell ls -d $(CUDA_BASE_DIR)/v12.? 2>/dev/null)
CUDA_11_LIB_DIR := $(CUDA_11)/bin
CUDA_12_LIB_DIR := $(CUDA_12)/bin
NVCC := $(shell X=$$(which nvcc 2>/dev/null) && cygpath -m -s "$$X")
ifneq ($(HIP_PATH),)
HIP_LIB_DIR := $(shell cygpath -m -s "$(HIP_PATH)\bin")
# If HIP_PATH has spaces, hipcc trips over them when subprocessing
HIP_PATH := $(shell cygpath -m -s "$(HIP_PATH)\")
export HIP_PATH
HIPCC := $(HIP_PATH)bin/hipcc.bin.exe
endif
CP := cp
CUDA_LIBS = $(wildcard $(addsuffix 64*.$(SHARED_EXT),$(addprefix $(CUDA_LIB_DIR)/$(SHARED_PREFIX),$(CUDA_LIBS_SHORT))))
else ifeq ($(OS),linux)
CP := cp -a
OBJ_EXT := o
SHARED_EXT := so
SHARED_PREFIX := lib
CUDA_PATH?=/usr/local/cuda
HIP_PATH?=/opt/rocm
CUDA_LIB := "$(CUDA_PATH)/lib64"
HIP_LIB := "$(HIP_PATH)/lib"
NVCC := nvcc
HIPCC := hipcc
HIP_LIB_DIR := $(HIP_PATH)/lib
HIPCC := $(shell X=$$(which hipcc 2>/dev/null) && echo $$X)
CUDA_PATH?=/usr/local/cuda
CUDA_11=$(shell ls -d $(CUDA_PATH)-11 2>/dev/null)
CUDA_12=$(shell ls -d $(CUDA_PATH)-12 2>/dev/null)
CUDA_11_LIB_DIR := $(CUDA_11)/lib64
CUDA_12_LIB_DIR := $(CUDA_12)/lib64
else
OBJ_EXT := o
SHARED_EXT := so
CP := cp -a
endif
CUDA_11_LIBS = $(wildcard $(addsuffix .$(SHARED_EXT).*,$(addprefix $(CUDA_11_LIB_DIR)/$(SHARED_PREFIX),$(CUDA_LIBS_SHORT))))
CUDA_12_LIBS = $(wildcard $(addsuffix .$(SHARED_EXT).*,$(addprefix $(CUDA_12_LIB_DIR)/$(SHARED_PREFIX),$(CUDA_LIBS_SHORT))))
NVCC_11 = $(CUDA_11)/bin/nvcc
NVCC_12 = $(CUDA_12)/bin/nvcc
CUDA_DEPS_DIR = $(DIST_BASE)cuda/
ROCM_DEPS_DIR = $(DIST_BASE)rocm/
ifneq ($(CUDA_11),)
CUDA_11_VARIANT= _v11
CUDA_11_LIB_DEPS = $(addprefix $(CUDA_DEPS_DIR),$(notdir $(CUDA_11_LIBS)))
endif
ifneq ($(CUDA_12),)
CUDA_12_VARIANT= _v12
CUDA_12_LIB_DEPS = $(addprefix $(CUDA_DEPS_DIR),$(notdir $(CUDA_12_LIBS)))
endif
ifeq ($(OLLAMA_SKIP_ROCM_GENERATE),)
ifneq ($(HIPCC),)
ROCM_VERSION := $(subst $(space),.,$(wordlist 1,2,$(subst .,$(space),$(word 3,$(subst -,$(space),$(filter HIP version: %,$(shell $(HIPCC) --version)))))))
ifneq (,$(ROCM_VERSION))
ROCM_VARIANT = _v$(ROCM_VERSION)
endif
ROCM_LIBS = $(wildcard $(addsuffix .$(SHARED_EXT),$(addprefix $(HIP_LIB_DIR)/$(SHARED_PREFIX),$(ROCM_LIBS_SHORT))))
ROCM_LIB_DEPS = $(addprefix $(ROCM_DEPS_DIR),$(notdir $(ROCM_LIBS)))
ROCBLAS_DEP_MANIFEST = $(ROCM_DEPS_DIR)/rocblas/library/TensileManifest.txt
endif
endif
CUDA_SRCS := \
@ -51,21 +112,19 @@ CUDA_SRCS := \
$(wildcard ggml-cuda/template-instances/fattn-vec*f16-f16.cu) \
ggml.c ggml-backend.c ggml-alloc.c ggml-quants.c sgemm.cpp
CUDA_OBJS := $(CUDA_SRCS:.cu=.cuda.$(OBJ_EXT))
CUDA_OBJS := $(CUDA_OBJS:.c=.cuda.$(OBJ_EXT))
CUDA_OBJS := $(addprefix $(BUILD_DIR),$(CUDA_OBJS:.cpp=.cuda.$(OBJ_EXT)))
CUDA_11_OBJS := $(CUDA_SRCS:.cu=.cuda.$(OBJ_EXT))
CUDA_11_OBJS := $(CUDA_11_OBJS:.c=.cuda.$(OBJ_EXT))
CUDA_11_OBJS := $(addprefix $(BUILD_DIR)/cuda_v11/,$(CUDA_11_OBJS:.cpp=.cuda.$(OBJ_EXT)))
CUDA_12_OBJS := $(CUDA_SRCS:.cu=.cuda.$(OBJ_EXT))
CUDA_12_OBJS := $(CUDA_12_OBJS:.c=.cuda.$(OBJ_EXT))
CUDA_12_OBJS := $(addprefix $(BUILD_DIR)/cuda_v12/,$(CUDA_12_OBJS:.cpp=.cuda.$(OBJ_EXT)))
HIP_OBJS := $(CUDA_SRCS:.cu=.hip.$(OBJ_EXT))
HIP_OBJS := $(HIP_OBJS:.c=.hip.$(OBJ_EXT))
HIP_OBJS := $(addprefix $(BUILD_DIR),$(HIP_OBJS:.cpp=.hip.$(OBJ_EXT)))
HIP_OBJS := $(addprefix $(BUILD_DIR)/,$(HIP_OBJS:.cpp=.hip.$(OBJ_EXT)))
CUDA_FLAGS := \
--generate-code=arch=compute_50,code=[compute_50,sm_50] \
--generate-code=arch=compute_52,code=[compute_52,sm_52] \
--generate-code=arch=compute_61,code=[compute_61,sm_61] \
--generate-code=arch=compute_70,code=[compute_70,sm_70] \
--generate-code=arch=compute_75,code=[compute_75,sm_75] \
--generate-code=arch=compute_80,code=[compute_80,sm_80] \
-t4 \
-DGGML_CUDA_DMMV_X=32 \
-DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 \
-DGGML_USE_CUDA=1 \
@ -82,6 +141,34 @@ CUDA_FLAGS := \
-I. \
-O3
CUDA_11_FLAGS := \
--generate-code=arch=compute_50,code=[compute_50,sm_50] \
--generate-code=arch=compute_52,code=[compute_52,sm_52] \
--generate-code=arch=compute_53,code=[compute_53,sm_53] \
--generate-code=arch=compute_60,code=[compute_60,sm_60] \
--generate-code=arch=compute_61,code=[compute_61,sm_61] \
--generate-code=arch=compute_62,code=[compute_62,sm_62] \
--generate-code=arch=compute_70,code=[compute_70,sm_70] \
--generate-code=arch=compute_72,code=[compute_72,sm_72] \
--generate-code=arch=compute_75,code=[compute_75,sm_75] \
--generate-code=arch=compute_80,code=[compute_80,sm_80] \
--generate-code=arch=compute_86,code=[compute_86,sm_86]
CUDA_12_FLAGS := \
--generate-code=arch=compute_60,code=[compute_60,sm_60] \
--generate-code=arch=compute_61,code=[compute_61,sm_61] \
--generate-code=arch=compute_62,code=[compute_62,sm_62] \
--generate-code=arch=compute_70,code=[compute_70,sm_70] \
--generate-code=arch=compute_72,code=[compute_72,sm_72] \
--generate-code=arch=compute_75,code=[compute_75,sm_75] \
--generate-code=arch=compute_80,code=[compute_80,sm_80] \
--generate-code=arch=compute_86,code=[compute_86,sm_86] \
--generate-code=arch=compute_87,code=[compute_87,sm_87] \
--generate-code=arch=compute_89,code=[compute_89,sm_89] \
--generate-code=arch=compute_90,code=[compute_90,sm_90] \
--generate-code=arch=compute_90a,code=[compute_90a,sm_90a] \
-DGGML_CUDA_USE_GRAPHS=on
HIP_ARCHS := gfx900 gfx940 gfx941 gfx942 gfx1010 gfx1012 gfx1030 gfx1100 gfx1101 gfx1102
LINUX_HIP_ARCHS := gfx906:xnack- gfx908:xnack- gfx90a:xnack+ gfx90a:xnack-
@ -111,79 +198,181 @@ HIP_FLAGS := \
-I. \
$(foreach arch, $(HIP_ARCHS), --offload-arch=$(arch))
ifeq ($(OS), Linux)
HIP_FLAGS += $(foreach arch, $(LINUX_HIP_ARCHS), --offload-arch=$(arch)) -fPIC
CUDA_FLAGS += -fPIC
CFLAGS += -Xcompiler -fPIC -D_GNU_SOURCE
CXXFLAGS += -Xcompiler -fPIC -D_GNU_SOURCE
else ifeq ($(OS),Windows)
ifeq ($(OS),linux)
HIP_FLAGS += $(foreach arch, $(LINUX_HIP_ARCHS), --offload-arch=$(arch)) -fPIC -Wno-unused-function
CUDA_FLAGS += -fPIC -Wno-unused-function
NVCC_CFLAGS = $(CFLAGS) -Xcompiler -fPIC -D_GNU_SOURCE
NVCC_CXXFLAGS = $(CXXFLAGS) -Xcompiler -fPIC -D_GNU_SOURCE
HIPCC_CFLAGS = $(CFLAGS) -fPIC -D_GNU_SOURCE
HIPCC_CXXFLAGS = $(CXXFLAGS) -fPIC -D_GNU_SOURCE
else ifeq ($(OS),windows)
HIP_FLAGS += -Xclang --dependent-lib=msvcrt
CFLAGS += -D_WIN32_WINNT=0x602
CXXFLAGS += -D_WIN32_WINNT=0x602
NVCC_CFLAGS = $(CFLAGS)
NVCC_CXXFLAGS = $(CXXFLAGS)
HIPCC_CFLAGS = $(CFLAGS)
HIPCC_CXXFLAGS = $(CXXFLAGS)
endif
RUNNERS := $(BUILD_DIR)ollama_runner$(EXE_EXT)
ifeq ($(OLLAMA_SKIP_CPU_GENERATE),)
RUNNERS := $(DEFAULT_RUNNER)
ifeq ($(ARCH),amd64)
RUNNERS += $(BUILD_DIR)ollama_runner_avx$(EXE_EXT) $(BUILD_DIR)ollama_runner_avx2$(EXE_EXT)
RUNNERS += cpu_avx cpu_avx2
endif
ifneq ($(NVCC),)
RUNNERS += $(BUILD_DIR)ollama_runner_cuda$(EXE_EXT)
endif
ifeq ($(OLLAMA_SKIP_CUDA_GENERATE),)
ifneq ($(CUDA_11),)
RUNNERS += cuda_v11
endif
ifneq ($(CUDA_12),)
RUNNERS += cuda_v12
endif
endif
ifeq ($(OLLAMA_SKIP_ROCM_GENERATE),)
ifneq ($(HIPCC),)
RUNNERS += $(BUILD_DIR)ollama_runner_rocm$(EXE_EXT)
RUNNERS += rocm$(ROCM_VARIANT)
endif
endif
runners: $(RUNNERS)
DIST_RUNNERS = $(addprefix $(RUNNERS_DIST_DIR)/,$(addsuffix /ollama_runner$(EXE_EXT),$(RUNNERS)))
PAYLOAD_RUNNERS = $(addprefix $(RUNNERS_PAYLOAD_DIR)/,$(addsuffix /ollama_runner$(EXE_EXT).gz,$(addsuffix /bin,$(RUNNERS))))
BUILD_RUNNERS = $(addprefix $(RUNNERS_BUILD_DIR)/,$(addsuffix /ollama_runner$(EXE_EXT),$(RUNNERS)))
$(BUILD_DIR)%.cuda.$(OBJ_EXT): %.cu
all: dist payload
dist: $(DIST_RUNNERS) $(ROCBLAS_DEP_MANIFEST)
ifeq ($(OS),windows)
# Unused on windows as we don't carry the payloads in the go binary
payload:
else
payload: $(PAYLOAD_RUNNERS)
endif
runners: $(BUILD_RUNNERS)
$(BUILD_DIR)/cuda_v11/%.cuda.$(OBJ_EXT): %.cu
@-mkdir -p $(dir $@)
$(NVCC) -c $(CUDA_FLAGS) -o $@ $<
$(NVCC_11) -c $(CUDA_FLAGS) $(CUDA_11_FLAGS) -o $@ $<
$(BUILD_DIR)%.cuda.$(OBJ_EXT): %.c
$(BUILD_DIR)/cuda_v11/%.cuda.$(OBJ_EXT): %.c
@-mkdir -p $(dir $@)
$(NVCC) -c $(CFLAGS) -o $@ $<
$(NVCC_11) -c $(NVCC_CFLAGS) -o $@ $<
$(BUILD_DIR)%.cuda.$(OBJ_EXT): %.cpp
$(BUILD_DIR)/cuda_v11/%.cuda.$(OBJ_EXT): %.cpp
@-mkdir -p $(dir $@)
$(NVCC) -c $(CXXFLAGS) -o $@ $<
$(NVCC_11) -c $(NVCC_CXXFLAGS) -o $@ $<
$(BUILD_DIR)$(SHARED_PREFIX)ggml_cuda.$(SHARED_EXT): $(CUDA_OBJS)
$(BUILD_DIR)/cuda_v12/%.cuda.$(OBJ_EXT): %.cu
@-mkdir -p $(dir $@)
$(NVCC) --shared -lcuda -lcublas -lcudart -lcublasLt $(CUDA_FLAGS) $(CUDA_OBJS) -o $@
$(NVCC_12) -c $(CUDA_FLAGS) $(CUDA_12_FLAGS) -o $@ $<
$(BUILD_DIR)%.hip.$(OBJ_EXT): %.cu
$(BUILD_DIR)/cuda_v12/%.cuda.$(OBJ_EXT): %.c
@-mkdir -p $(dir $@)
$(NVCC_12) -c $(NVCC_CFLAGS) -o $@ $<
$(BUILD_DIR)/cuda_v12/%.cuda.$(OBJ_EXT): %.cpp
@-mkdir -p $(dir $@)
$(NVCC_12) -c $(NVCC_CXXFLAGS) -o $@ $<
$(RUNNERS_DIST_DIR)/%: $(RUNNERS_BUILD_DIR)/%
@-mkdir -p $(dir $@)
cp $< $@
$(RUNNERS_DIST_DIR)/cuda_v11/ollama_runner$(EXE_EXT): $(RUNNERS_DIST_DIR)/cuda_v11/$(SHARED_PREFIX)ggml_cuda.$(SHARED_EXT)
$(RUNNERS_PAYLOAD_DIR)/cuda_v11/bin/ollama_runner$(EXE_EXT).gz: $(RUNNERS_PAYLOAD_DIR)/cuda_v11/bin/$(SHARED_PREFIX)ggml_cuda.$(SHARED_EXT).gz
$(RUNNERS_DIST_DIR)/cuda_v12/ollama_runner$(EXE_EXT): $(RUNNERS_DIST_DIR)/cuda_v12/$(SHARED_PREFIX)ggml_cuda.$(SHARED_EXT)
$(RUNNERS_PAYLOAD_DIR)/cuda_v12/bin/ollama_runner$(EXE_EXT).gz: $(RUNNERS_PAYLOAD_DIR)/cuda_v12/bin/$(SHARED_PREFIX)ggml_cuda.$(SHARED_EXT).gz
$(RUNNERS_BUILD_DIR)/cuda_v11/$(SHARED_PREFIX)ggml_cuda.$(SHARED_EXT): $(CUDA_11_OBJS) $(CUDA_11_LIB_DEPS)
@-mkdir -p $(dir $@)
$(NVCC_11) --shared -lcuda -L${CUDA_DEPS_DIR} $(foreach lib, $(CUDA_LIBS_SHORT), -l$(lib)) $(CUDA_FLAGS) $(CUDA_11_FLAGS) $(CUDA_11_OBJS) -o $@
$(RUNNERS_BUILD_DIR)/cuda_v12/$(SHARED_PREFIX)ggml_cuda.$(SHARED_EXT): $(CUDA_12_OBJS) $(CUDA_12_LIB_DEPS)
@-mkdir -p $(dir $@)
$(NVCC_12) --shared -lcuda -L${CUDA_DEPS_DIR} $(foreach lib, $(CUDA_LIBS_SHORT), -l$(lib)) $(CUDA_FLAGS) $(CUDA_12_FLAGS) $(CUDA_12_OBJS) -o $@
$(CUDA_11_LIB_DEPS):
@-mkdir -p $(dir $@)
$(CP) $(CUDA_11_LIB_DIR)/$(notdir $@)* $(dir $@)
$(CUDA_12_LIB_DEPS):
@-mkdir -p $(dir $@)
$(CP) $(CUDA_12_LIB_DIR)/$(notdir $@)* $(dir $@)
$(BUILD_DIR)/%.hip.$(OBJ_EXT): %.cu
@-mkdir -p $(dir $@)
$(HIPCC) -c $(HIP_FLAGS) -o $@ $<
$(BUILD_DIR)%.hip.$(OBJ_EXT): %.c
$(BUILD_DIR)/%.hip.$(OBJ_EXT): %.c
@-mkdir -p $(dir $@)
$(HIPCC) -c $(CFLAGS) -o $@ $<
$(HIPCC) -c $(HIPCC_CFLAGS) -o $@ $<
$(BUILD_DIR)%.hip.$(OBJ_EXT): %.cpp
$(BUILD_DIR)/%.hip.$(OBJ_EXT): %.cpp
@-mkdir -p $(dir $@)
$(HIPCC) -c $(CXXFLAGS) -o $@ $<
$(HIPCC) -c $(HIPCC_CXXFLAGS) -o $@ $<
$(BUILD_DIR)$(SHARED_PREFIX)ggml_hipblas.$(SHARED_EXT): $(HIP_OBJS)
$(RUNNERS_DIST_DIR)/rocm$(ROCM_VARIANT)/ollama_runner$(EXE_EXT): $(RUNNERS_DIST_DIR)/rocm$(ROCM_VARIANT)/$(SHARED_PREFIX)ggml_hipblas.$(SHARED_EXT)
$(RUNNERS_PAYLOAD_DIR)/rocm$(ROCM_VARIANT)/bin/ollama_runner$(EXE_EXT).gz: $(RUNNERS_PAYLOAD_DIR)/rocm$(ROCM_VARIANT)/bin/$(SHARED_PREFIX)ggml_hipblas.$(SHARED_EXT).gz
$(RUNNERS_BUILD_DIR)/rocm$(ROCM_VARIANT)/$(SHARED_PREFIX)ggml_hipblas.$(SHARED_EXT): $(HIP_OBJS) $(ROCM_LIB_DEPS)
@-mkdir -p $(dir $@)
$(HIPCC) --shared -lhipblas -lamdhip64 -lrocblas $(HIP_OBJS) -o $@
$(HIPCC) --shared -lamdhip64 -L${ROCM_DEPS_DIR} $(foreach lib, $(ROCM_LIBS_SHORT), -l$(lib)) $(HIP_OBJS) -o $@
$(BUILD_DIR)ollama_runner$(EXE_EXT):
$(ROCM_LIB_DEPS):
@-mkdir -p $(dir $@)
$(CP) $(HIP_LIB_DIR)/$(notdir $@)* $(dir $@)
$(RUNNERS_BUILD_DIR)/$(DEFAULT_RUNNER)/ollama_runner$(EXE_EXT): *.go ./runner/*.go
@-mkdir -p $(dir $@)
CGO_ENABLED=1 GOARCH=$(ARCH) go build -ldflags "-s -w" -o $@ ./runner
$(BUILD_DIR)ollama_runner_avx$(EXE_EXT):
$(RUNNERS_BUILD_DIR)/cpu_avx/ollama_runner$(EXE_EXT): *.go ./runner/*.go
@-mkdir -p $(dir $@)
CGO_ENABLED=1 GOARCH=$(ARCH) go build -ldflags "-s -w" -tags avx -o $@ ./runner
$(BUILD_DIR)ollama_runner_avx2$(EXE_EXT):
$(RUNNERS_BUILD_DIR)/cpu_avx2/ollama_runner$(EXE_EXT): *.go ./runner/*.go
@-mkdir -p $(dir $@)
CGO_ENABLED=1 GOARCH=$(ARCH) go build -ldflags "-s -w" -tags avx,avx2 -o $@ ./runner
$(BUILD_DIR)ollama_runner_cuda$(EXE_EXT): $(BUILD_DIR)$(SHARED_PREFIX)ggml_cuda.$(SHARED_EXT)
CGO_ENABLED=1 GOARCH=$(ARCH) CGO_LDFLAGS=-L"$(CUDA_LIB)" go build -ldflags "-s -w" -tags avx,cuda -o $@ ./runner
$(RUNNERS_BUILD_DIR)/cuda_v11/ollama_runner$(EXE_EXT): $(RUNNERS_BUILD_DIR)/cuda_v11/$(SHARED_PREFIX)ggml_cuda.$(SHARED_EXT) *.go ./runner/*.go
@-mkdir -p $(dir $@)
CGO_ENABLED=1 GOARCH=$(ARCH) CGO_LDFLAGS=-L"$(RUNNERS_BUILD_DIR)/cuda_v11/" go build -ldflags "-s -w" -tags avx,cuda -o $@ ./runner
$(BUILD_DIR)ollama_runner_rocm$(EXE_EXT): $(BUILD_DIR)$(SHARED_PREFIX)ggml_hipblas.$(SHARED_EXT)
CGO_ENABLED=1 GOARCH=$(ARCH) CGO_LDFLAGS=-L"$(HIP_LIB)" go build -ldflags "-s -w" -tags avx,rocm -o $@ ./runner
$(RUNNERS_BUILD_DIR)/cuda_v12/ollama_runner$(EXE_EXT): $(RUNNERS_BUILD_DIR)/cuda_v12/$(SHARED_PREFIX)ggml_cuda.$(SHARED_EXT) *.go ./runner/*.go
@-mkdir -p $(dir $@)
CGO_ENABLED=1 GOARCH=$(ARCH) CGO_LDFLAGS=-L"$(RUNNERS_BUILD_DIR)/cuda_v12/" go build -ldflags "-s -w" -tags avx,cuda -o $@ ./runner
$(RUNNERS_BUILD_DIR)/rocm$(ROCM_VARIANT)/ollama_runner$(EXE_EXT): $(RUNNERS_BUILD_DIR)/rocm$(ROCM_VARIANT)/$(SHARED_PREFIX)ggml_hipblas.$(SHARED_EXT) *.go ./runner/*.go
@-mkdir -p $(dir $@)
CGO_ENABLED=1 GOARCH=$(ARCH) CGO_LDFLAGS=-L"$(RUNNERS_BUILD_DIR)/rocm$(ROCM_VARIANT)/" go build -ldflags "-s -w" -tags avx,rocm -o $@ ./runner
$(ROCBLAS_DEP_MANIFEST):
@-mkdir -p $(dir $@)
@echo "Copying rocblas library..."
cd $(HIP_LIB_DIR)/rocblas/library/ && tar cf - . | (cd $(dir $@) && tar xf - )
@echo "rocblas library copy complete"
$(RUNNERS_PAYLOAD_DIR)/%/bin/ollama_runner.gz: $(RUNNERS_BUILD_DIR)/%/ollama_runner
@-mkdir -p $(dir $@)
gzip --best -c $< > $@
$(RUNNERS_PAYLOAD_DIR)/cuda_v11/bin/%.gz: $(RUNNERS_BUILD_DIR)/cuda_v11/%
@-mkdir -p $(dir $@)
gzip --best -c $< > $@
$(RUNNERS_PAYLOAD_DIR)/cuda_v12/bin/%.gz: $(RUNNERS_BUILD_DIR)/cuda_v12/%
@-mkdir -p $(dir $@)
gzip --best -c $< > $@
$(RUNNERS_PAYLOAD_DIR)/rocm$(ROCM_VARIANT)/bin/%.gz: $(RUNNERS_BUILD_DIR)/rocm$(ROCM_VARIANT)/%
@-mkdir -p $(dir $@)
gzip --best -c $< > $@
clean:
rm -rf $(BUILD_DIR)
rm -rf $(BUILD_DIR) $(DIST_RUNNERS) $(PAYLOAD_RUNNERS)
.PHONY: runners clean ollama_runner$(EXE_EXT) ollama_runner_avx$(EXE_EXT) ollama_runner_avx2$(EXE_EXT) ollama_runner_cuda$(EXE_EXT) ollama_runner_rocm$(EXE_EXT)
.PHONY: all dist payload runners clean $(RUNNERS)
# Handy debugging for make variables
print-%:

View File

@ -10,6 +10,8 @@ import (
"math"
"net"
"net/http"
"os"
"path/filepath"
"runtime"
"strconv"
"strings"
@ -146,7 +148,7 @@ func (s *Server) run(ctx context.Context) {
case <-ctx.Done():
return
default:
slog.Info("Processing batch", "seqs", len(s.seqs))
slog.Debug("Processing batch", "seqs", len(s.seqs))
s.mu.Lock()
for s.allNil() {
s.cond.Wait() // Wait until an item is added
@ -186,6 +188,7 @@ func (s *Server) run(ctx context.Context) {
err := s.lc.Decode(batch)
if err != nil {
slog.Error("failed to decode batch", "error", err)
panic("Failed to decode")
}
@ -227,7 +230,7 @@ func (s *Server) run(ctx context.Context) {
seq.numPredicted++
slog.Info("sampled", "piece", piece)
slog.Debug("sampled", "piece", piece)
// if it's an end of sequence token, break
// TODO: just end this sequence
@ -463,7 +466,6 @@ type HealthResponse struct {
// TODO (jmorganca): is it safe to do this concurrently with decoding?
func (s *Server) health(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/json")
if err := json.NewEncoder(w).Encode(&HealthResponse{
Status: s.status,
Progress: s.progress,
@ -475,17 +477,63 @@ func (s *Server) health(w http.ResponseWriter, r *http.Request) {
func main() {
mpath := flag.String("model", "", "Path to model binary file")
ppath := flag.String("projector", "", "Path to projector binary file")
ppath := flag.String("mmproj", "", "Path to projector binary file")
parallel := flag.Int("parallel", 1, "Number of sequences to handle simultaneously")
batchSize := flag.Int("batch-size", 512, "Batch size")
nGpuLayers := flag.Int("num-gpu", 0, "Number of layers to offload to GPU")
nGpuLayers := flag.Int("n-gpu-layers", 0, "Number of layers to offload to GPU")
mainGpu := flag.Int("main-gpu", 0, "Main GPU")
flashAttention := flag.Bool("flash-attention", false, "Enable flash attention")
numCtx := flag.Int("num-ctx", 2048, "Context (or KV cache) size")
flashAttention := flag.Bool("flash-attn", false, "Enable flash attention")
numCtx := flag.Int("ctx-size", 2048, "Context (or KV cache) size")
lpath := flag.String("lora", "", "Path to lora layer file")
port := flag.Int("port", 8080, "Port to expose the server on")
threads := flag.Int("threads", runtime.NumCPU(), "Number of threads to use during generation")
// TODO not yet implemented but wired to keep the parsing aligned
embedding := flag.Bool("embedding", false, "enable embedding vector output (default: disabled)")
logDisable := flag.Bool("log-disable", false, "disables logging to a file")
verbose := flag.Bool("verbose", false, "verbose output (default: disabled)")
f32 := flag.Bool("memory-f32", false, "use f32 instead of f16 for memory key+value (default: disabled) not recommended: doubles context memory required and no measurable increase in quality")
noMmap := flag.Bool("no-mmap", false, "do not memory-map model (slower load but may reduce pageouts if not using mlock)")
mlock := flag.Bool("mlock", false, "force system to keep model in RAM rather than swapping or compressing")
tensorSplit := flag.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions")
flag.Parse()
level := slog.LevelInfo
if *verbose {
level = slog.LevelDebug
}
handler := slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
Level: level,
AddSource: true,
ReplaceAttr: func(_ []string, attr slog.Attr) slog.Attr {
if attr.Key == slog.SourceKey {
source := attr.Value.Any().(*slog.Source)
source.File = filepath.Base(source.File)
}
return attr
},
})
slog.SetDefault(slog.New(handler))
// TODO actually implement...
if *embedding {
slog.Warn("embeddings not yet support")
}
if *logDisable {
slog.Info("ignoring --log-disable")
}
if *f32 {
slog.Warn("memory-f32 not yet supported")
}
if *noMmap {
slog.Warn("no-mmap not yet supported")
}
if *mlock {
slog.Warn("mlock not yet supported")
}
if *tensorSplit != "" {
slog.Warn("tensor-split not yet implemented")
}
server := &Server{
numCtx: *numCtx,
@ -498,7 +546,7 @@ func main() {
// load the model
llama.BackendInit()
params := llama.NewModelParams(*nGpuLayers, *mainGpu, func(progress float32) {
slog.Info("Loading model", "progress %", math.Round(float64(progress*100)))
slog.Debug("Loading model", "progress %", math.Round(float64(progress*100)))
server.progress = progress
})
server.model = llama.LoadModelFromFile(*mpath, params)
@ -531,7 +579,7 @@ func main() {
defer listener.Close()
mux := http.NewServeMux()
mux.HandleFunc("/embeddings", server.embeddings)
mux.HandleFunc("/embedding", server.embeddings)
mux.HandleFunc("/completion", server.completion)
mux.HandleFunc("/health", server.health)
@ -539,7 +587,7 @@ func main() {
Handler: mux,
}
server.status = "ready"
server.status = "ok"
log.Println("Server listening on", addr)
if err := httpServer.Serve(listener); err != nil {

View File

@ -12,6 +12,7 @@ package llm
import "C"
import (
"fmt"
"log/slog"
"unsafe"
)
@ -37,3 +38,68 @@ func Quantize(infile, outfile string, ftype fileType) error {
return nil
}
// loadedModel wraps a llama.cpp model handle loaded in-process via cgo.
// It is used for local tokenize/detokenize without round-tripping through
// the runner subprocess; release it with freeModel when done.
type loadedModel struct {
model *C.struct_llama_model
}
// loadModel loads the GGUF model at modelfile in-process via llama.cpp.
// When vocabOnly is true only the vocabulary is loaded, which is much
// cheaper and is sufficient for tokenize/detokenize use.
// The returned model must be released with freeModel.
func loadModel(modelfile string, vocabOnly bool) (*loadedModel, error) {
	// TODO figure out how to quiet down the logging so we don't have 2 copies of the model metadata showing up
	params := C.llama_model_default_params()
	params.vocab_only = C.bool(vocabOnly)

	cmodelfile := C.CString(modelfile)
	defer C.free(unsafe.Pointer(cmodelfile))

	// Debug level: this fires on every model load and is only useful when tracing.
	slog.Debug("loading model", "model", modelfile)
	model := C.llama_load_model_from_file(cmodelfile, params)
	if model == nil {
		return nil, fmt.Errorf("failed to load model %s", modelfile)
	}
	return &loadedModel{model}, nil
}
// freeModel releases the llama.cpp model handle allocated by loadModel.
func freeModel(model *loadedModel) {
C.llama_free_model(model.model)
}
// tokenize converts content into llama.cpp token IDs using the loaded
// model's vocabulary. It returns a nil slice for content that produces
// no tokens.
func tokenize(model *loadedModel, content string) ([]int, error) {
	ccontent := C.CString(content)
	defer C.free(unsafe.Pointer(ccontent))

	// Initial guess at capacity; worst case is roughly one token per byte
	// plus special tokens. If it's too small we retry below.
	maxTokens := len(content) + 2
	tokens := make([]C.int32_t, maxTokens)
	tokenCount := C.llama_tokenize(model.model, ccontent, C.int32_t(len(content)), &tokens[0], C.int32_t(maxTokens), true, true)
	if tokenCount < 0 {
		// A negative result is the negated number of tokens required.
		// The original code allocated make([]C.int32_t, int(tokenCount))
		// with a negative length, which panics; negate it instead.
		required := -tokenCount
		slog.Debug("tokenize buffer too small, retrying", "required", required)
		tokens = make([]C.int32_t, required)
		tokenCount = C.llama_tokenize(model.model, ccontent, C.int32_t(len(content)), &tokens[0], required, true, true)
		if tokenCount < 0 {
			return nil, fmt.Errorf("tokenization failed for content of length %d", len(content))
		}
	} else if tokenCount == 0 {
		return nil, nil
	}

	ret := make([]int, tokenCount)
	for i := range int(tokenCount) {
		ret[i] = int(tokens[i])
	}
	slog.Debug("tokenized", "tokens", ret, "content", content)
	return ret, nil
}
// detokenize converts token IDs back into the corresponding text by
// concatenating each token's piece from the model vocabulary.
func detokenize(model *loadedModel, tokens []int) string {
	var resp string
	for _, token := range tokens {
		buf := make([]C.char, 8)
		nTokens := C.llama_token_to_piece(model.model, C.int(token), &buf[0], 8, 0, true)
		if nTokens < 0 {
			// A negative result is the negated buffer size required for
			// this piece; reallocate and retry.
			buf = make([]C.char, -nTokens)
			nTokens = C.llama_token_to_piece(model.model, C.int(token), &buf[0], -nTokens, 0, true)
		}
		resp += C.GoStringN(&buf[0], nTokens)
	}
	// Debug level: per-request payload logging is too noisy for Info.
	slog.Debug("detokenized", "tokens", tokens, "content", resp)
	return resp
}

View File

@ -50,6 +50,7 @@ type llmServer struct {
status *StatusWriter
options api.Options
numParallel int
model *loadedModel
estimate MemoryEstimate
totalLayers uint64
@ -322,6 +323,9 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
}
server := filepath.Join(dir, "ollama_llama_server")
if envconfig.NewRunners() {
server = filepath.Join(dir, "ollama_runner")
}
if runtime.GOOS == "windows" {
server += ".exe"
}
@ -329,6 +333,9 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
// Detect tmp cleaners wiping out the file
_, err := os.Stat(server)
if errors.Is(err, os.ErrNotExist) {
if envconfig.NewRunners() {
return nil, fmt.Errorf("experimental runners enabled, but not present in this build")
}
slog.Warn("llama server disappeared, reinitializing payloads", "path", server, "error", err)
err = Init()
if err != nil {
@ -337,11 +344,16 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
}
}
m, err := loadModel(model, true)
if err != nil {
return nil, fmt.Errorf("unable to load model for tokenization %w", err)
}
s := &llmServer{
port: port,
cmd: exec.Command(server, finalParams...),
status: NewStatusWriter(os.Stderr),
options: opts,
model: m,
estimate: estimate,
numParallel: numParallel,
sem: semaphore.NewWeighted(int64(numParallel)),
@ -816,16 +828,16 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
continue
}
// slog.Debug("got line", "line", string(line))
evt, ok := bytes.CutPrefix(line, []byte("data: "))
if !ok {
return fmt.Errorf("error parsing llm response stream: %s", line)
evt = line
}
var c completion
if err := json.Unmarshal(evt, &c); err != nil {
return fmt.Errorf("error unmarshalling llm prediction response: %v", err)
}
switch {
case strings.TrimSpace(c.Content) == lastToken:
tokenRepeat++
@ -927,7 +939,7 @@ func (s *llmServer) Embedding(ctx context.Context, input string) ([]float32, err
}
if resp.StatusCode >= 400 {
log.Printf("llm encode error: %s", body)
log.Printf("llm embedding error: %s", body)
return nil, fmt.Errorf("%s", body)
}
@ -948,47 +960,7 @@ type TokenizeResponse struct {
}
func (s *llmServer) Tokenize(ctx context.Context, content string) ([]int, error) {
// Make sure the server is ready
status, err := s.getServerStatus(ctx)
if err != nil {
return nil, err
} else if status != ServerStatusReady && status != ServerStatusNoSlotsAvailable {
return nil, fmt.Errorf("unexpected server status: %s", status.ToString())
}
data, err := json.Marshal(TokenizeRequest{Content: content})
if err != nil {
return nil, fmt.Errorf("marshaling encode data: %w", err)
}
req, err := http.NewRequestWithContext(ctx, http.MethodPost, fmt.Sprintf("http://127.0.0.1:%d/tokenize", s.port), bytes.NewBuffer(data))
if err != nil {
return nil, fmt.Errorf("encode request: %w", err)
}
req.Header.Set("Content-Type", "application/json")
resp, err := http.DefaultClient.Do(req)
if err != nil {
return nil, fmt.Errorf("do encode request: %w", err)
}
defer resp.Body.Close()
body, err := io.ReadAll(resp.Body)
if err != nil {
return nil, fmt.Errorf("read encode request: %w", err)
}
if resp.StatusCode >= 400 {
log.Printf("llm encode error: %s", body)
return nil, fmt.Errorf("%s", body)
}
var encoded TokenizeResponse
if err := json.Unmarshal(body, &encoded); err != nil {
return nil, fmt.Errorf("unmarshal encode response: %w", err)
}
return encoded.Tokens, nil
return tokenize(s.model, content)
}
type DetokenizeRequest struct {
@ -1000,50 +972,13 @@ type DetokenizeResponse struct {
}
func (s *llmServer) Detokenize(ctx context.Context, tokens []int) (string, error) {
// Make sure the server is ready
status, err := s.getServerStatus(ctx)
if err != nil {
return "", err
} else if status != ServerStatusReady && status != ServerStatusNoSlotsAvailable {
return "", fmt.Errorf("unexpected server status: %s", status.ToString())
}
data, err := json.Marshal(DetokenizeRequest{Tokens: tokens})
if err != nil {
return "", fmt.Errorf("marshaling decode data: %w", err)
}
req, err := http.NewRequestWithContext(ctx, http.MethodPost, fmt.Sprintf("http://127.0.0.1:%d/detokenize", s.port), bytes.NewBuffer(data))
if err != nil {
return "", fmt.Errorf("decode request: %w", err)
}
req.Header.Set("Content-Type", "application/json")
resp, err := http.DefaultClient.Do(req)
if err != nil {
return "", fmt.Errorf("do decode request: %w", err)
}
defer resp.Body.Close()
body, err := io.ReadAll(resp.Body)
if err != nil {
return "", fmt.Errorf("read decode request: %w", err)
}
if resp.StatusCode >= 400 {
log.Printf("llm decode error: %s", body)
return "", fmt.Errorf("%s", body)
}
var decoded DetokenizeResponse
if err := json.Unmarshal(body, &decoded); err != nil {
return "", fmt.Errorf("unmarshal encode response: %w", err)
}
return decoded.Content, nil
return detokenize(s.model, tokens), nil
}
func (s *llmServer) Close() error {
if s.model != nil {
freeModel(s.model)
}
if s.cmd != nil {
slog.Debug("stopping llama server")
if err := s.cmd.Process.Kill(); err != nil {