diff --git a/app/ollama.iss b/app/ollama.iss
index 7362eeeb..d5940be6 100644
--- a/app/ollama.iss
+++ b/app/ollama.iss
@@ -136,7 +136,7 @@ Type: filesandordirs; Name: "{%TEMP}\ollama*"
 Type: filesandordirs; Name: "{%LOCALAPPDATA}\Programs\Ollama"
 
 [Messages]
-WizardReady=Ollama Windows Preview
+WizardReady=Ollama
 ReadyLabel1=%nLet's get you up and running with your own large language models.
 SetupAppRunningError=Another Ollama installer is running.%n%nPlease cancel or finish the other installer, then click OK to continue with this install, or Cancel to exit.
 
diff --git a/discover/gpu_info_nvcuda.c b/discover/gpu_info_nvcuda.c
index a1a38bfc..466e1ac2 100644
--- a/discover/gpu_info_nvcuda.c
+++ b/discover/gpu_info_nvcuda.c
@@ -4,6 +4,7 @@
 #include "gpu_info_nvcuda.h"
 
 void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
+  LOG(resp->ch.verbose, "initializing %s\n", nvcuda_lib_path);
   CUresult ret;
   resp->err = NULL;
   resp->num_devices = 0;
@@ -57,8 +58,10 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
       resp->cudaErr = -1;
       return;
     }
+    LOG(resp->ch.verbose, "dlsym: %s - %p\n", l[i].s, *l[i].p);
   }
 
+  LOG(resp->ch.verbose, "calling cuInit\n");
   ret = (*resp->ch.cuInit)(0);
   if (ret != CUDA_SUCCESS) {
     LOG(resp->ch.verbose, "cuInit err: %d\n", ret);
@@ -75,15 +78,18 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
   resp->ch.driver_minor = 0;
 
   // Report driver version if we're in verbose mode, ignore errors
+  LOG(resp->ch.verbose, "calling cuDriverGetVersion\n");
   ret = (*resp->ch.cuDriverGetVersion)(&version);
   if (ret != CUDA_SUCCESS) {
     LOG(resp->ch.verbose, "cuDriverGetVersion failed: %d\n", ret);
   } else {
+    LOG(resp->ch.verbose, "raw version 0x%x\n", version);
     resp->ch.driver_major = version / 1000;
     resp->ch.driver_minor = (version - (resp->ch.driver_major * 1000)) / 10;
     LOG(resp->ch.verbose, "CUDA driver version: %d.%d\n", resp->ch.driver_major, resp->ch.driver_minor);
   }
 
+  LOG(resp->ch.verbose, "calling cuDeviceGetCount\n");
   ret = (*resp->ch.cuDeviceGetCount)(&resp->num_devices);
   if (ret != CUDA_SUCCESS) {
     LOG(resp->ch.verbose, "cuDeviceGetCount err: %d\n", ret);
@@ -94,6 +100,7 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
     resp->cudaErr = ret;
     return;
   }
+  LOG(resp->ch.verbose, "device count %d\n", resp->num_devices);
 }
 
 const int buflen = 256;
diff --git a/docs/development.md b/docs/development.md
index f2039a08..13457ae3 100644
--- a/docs/development.md
+++ b/docs/development.md
@@ -108,7 +108,7 @@ Custom CPU settings are not currently supported in the new Go server build but w
 
 #### Containerized Linux Build
 
-If you have Docker available, you can build linux binaries with `OLLAMA_NEW_RUNNERS=1 ./scripts/build_linux.sh` which has the CUDA and ROCm dependencies included. The resulting binary is placed in `./dist`
+If you have Docker available, you can build linux binaries with `./scripts/build_linux.sh` which has the CUDA and ROCm dependencies included. The resulting binary is placed in `./dist`
 
 ### Windows
 
diff --git a/llama/make/Makefile.rocm b/llama/make/Makefile.rocm
index 947c43a6..4ab176b4 100644
--- a/llama/make/Makefile.rocm
+++ b/llama/make/Makefile.rocm
@@ -58,6 +58,8 @@ endif
 GPU_COMPILER_CUFLAGS = \
 	$(GPU_COMPILER_FPIC) \
 	$(addprefix -m,$(GPU_RUNNER_CPU_FLAGS)) \
+	-mf16c \
+	-mfma \
 	-parallel-jobs=2 \
 	-c \
 	-O3 \
@@ -77,6 +79,9 @@ GPU_COMPILER_CUFLAGS = \
 	-D_CRT_SECURE_NO_WARNINGS \
 	-D_GNU_SOURCE \
 	-D_XOPEN_SOURCE=600 \
+	-DUSE_PROF_API=1 \
+	-std=gnu++14 \
+	-x hip \
 	-mllvm=-amdgpu-early-inline-all=true \
 	-mllvm=-amdgpu-function-calls=false \
 	-Wno-expansion-to-defined \
@@ -87,6 +92,12 @@ GPU_COMPILER_CUFLAGS = \
 	-Wno-unused-result \
 	-I.
 
+# Workaround buggy P2P copy on some windows multi-GPU setups
+# This workaround breaks linux systems with small system RAM, so only enable on windows
+ifeq ($(OS),windows)
+	GPU_COMPILER_CUFLAGS += -DGGML_CUDA_NO_PEER_COPY=1
+endif
+
 include make/gpu.make
 
 # Adjust the rules from gpu.make to handle the ROCm dependencies properly
diff --git a/llama/make/gpu.make b/llama/make/gpu.make
index 939fa41a..fbd8dbca 100644
--- a/llama/make/gpu.make
+++ b/llama/make/gpu.make
@@ -85,7 +85,7 @@ $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/ollama_llama_server$(EXE_EXT): $(RUNNERS
 	GOARCH=$(ARCH) CGO_LDFLAGS="$(TARGET_CGO_LDFLAGS)" go build -buildmode=pie $(GPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(GPU_RUNNER_CPU_FLAGS) $(GPU_RUNNER_GO_TAGS)) -o $@ ./runner
 $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT): $(GPU_RUNNER_OBJS) $(DIST_GPU_RUNNER_LIB_DEPS) $(COMMON_HDRS) $(GPU_RUNNER_HDRS)
 	@-mkdir -p $(dir $@)
-	$(CCACHE) $(GPU_COMPILER) --shared $(GPU_RUNNER_DRIVER_LIB_LINK) -L${DIST_GPU_RUNNER_DEPS_DIR} $(foreach lib, $(GPU_RUNNER_LIBS_SHORT), -l$(lib)) $(GPU_RUNNER_OBJS) -o $@
+	$(CCACHE) $(GPU_COMPILER) --shared -L$(GPU_LIB_DIR) $(GPU_RUNNER_DRIVER_LIB_LINK) -L${DIST_GPU_RUNNER_DEPS_DIR} $(foreach lib, $(GPU_RUNNER_LIBS_SHORT), -l$(lib)) $(GPU_RUNNER_OBJS) -o $@
 
 # Distribution targets
 $(RUNNERS_DIST_DIR)/%: $(RUNNERS_BUILD_DIR)/%
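Note: every line added in discover/gpu_info_nvcuda.c routes through the verbose-gated LOG macro from the discover headers. As a rough sketch only, a minimal compatible definition might look like the following; the repo's actual macro may differ, so treat this as an illustrative assumption rather than the project's definition:

#include <stdio.h>

/* Hypothetical minimal LOG: print to stderr only when verbose is set.
 * Illustrative stand-in, not the repo's actual definition. */
#define LOG(verbose, ...)           \
  do {                              \
    if (verbose) {                  \
      fprintf(stderr, __VA_ARGS__); \
    }                               \
  } while (0)

int main(void) {
  int verbose = 1;
  LOG(verbose, "calling cuInit\n");      // printed
  verbose = 0;
  LOG(verbose, "device count %d\n", 2);  // suppressed
  return 0;
}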
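The cuDriverGetVersion hunk decodes CUDA's packed integer driver version (major * 1000 + minor * 10) into a major.minor pair. A self-contained sketch of that arithmetic, using 12040 as an example input (the value itself is illustrative):

#include <stdio.h>

int main(void) {
  int version = 12040;  // example raw value packed as major*1000 + minor*10
  int driver_major = version / 1000;                          // 12
  int driver_minor = (version - (driver_major * 1000)) / 10;  // 4
  printf("CUDA driver version: %d.%d\n", driver_major, driver_minor);  // 12.4
  return 0;
}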