diff --git a/Dockerfile.build b/Dockerfile.build
index 5499b0a1..6b7e3c4d 100644
--- a/Dockerfile.build
+++ b/Dockerfile.build
@@ -3,7 +3,7 @@ FROM --platform=linux/amd64 ubuntu:20.04 AS base-amd64
 ARG CUDA_VERSION=11.3.1-1
 ARG CMAKE_VERSION=3.22.1
 # ROCm only supports amd64
-ARG ROCM_VERSION=5.7
+ARG ROCM_VERSION=6.0
 # Note: https://rocm.docs.amd.com/en/latest/release/user_kernel_space_compat_matrix.html
 
 RUN apt-get update && \
diff --git a/gpu/gpu.go b/gpu/gpu.go
index d03812c8..91ced3a8 100644
--- a/gpu/gpu.go
+++ b/gpu/gpu.go
@@ -65,7 +65,7 @@ func GetGPUInfo() GpuInfo {
 	}
 
 	var memInfo C.mem_info_t
-	resp := GpuInfo{"", 0, 0}
+	resp := GpuInfo{"", "", 0, 0}
 	if gpuHandles.cuda != nil {
 		C.cuda_check_vram(*gpuHandles.cuda, &memInfo)
 		if memInfo.err != nil {
@@ -73,6 +73,7 @@ func GetGPUInfo() GpuInfo {
 			C.free(unsafe.Pointer(memInfo.err))
 		} else {
 			resp.Driver = "CUDA"
+			resp.Library = "cuda_server"
 		}
 	} else if gpuHandles.rocm != nil {
 		C.rocm_check_vram(*gpuHandles.rocm, &memInfo)
@@ -81,11 +82,14 @@ func GetGPUInfo() GpuInfo {
 			C.free(unsafe.Pointer(memInfo.err))
 		} else {
 			resp.Driver = "ROCM"
+			resp.Library = "rocm_server"
 		}
 	}
 	if resp.Driver == "" {
 		C.cpu_check_ram(&memInfo)
 		resp.Driver = "CPU"
+		// In the future we may offer multiple CPU variants to tune CPU features
+		resp.Library = "default"
 	}
 	if memInfo.err != nil {
 		log.Printf("error looking up CPU memory: %s", C.GoString(memInfo.err))
diff --git a/gpu/gpu_darwin.go b/gpu/gpu_darwin.go
index 14bd2655..ccf67b51 100644
--- a/gpu/gpu_darwin.go
+++ b/gpu/gpu_darwin.go
@@ -21,6 +21,7 @@ func GetGPUInfo() GpuInfo {
 	return GpuInfo{
 		Driver:      "METAL",
+		Library:     "default",
 		TotalMemory: 0,
 		FreeMemory:  0,
 	}
diff --git a/gpu/types.go b/gpu/types.go
index a84a0a8d..a56da45e 100644
--- a/gpu/types.go
+++ b/gpu/types.go
@@ -3,6 +3,7 @@ package gpu
 
 // Beginning of an `ollama info` command
 type GpuInfo struct {
 	Driver      string `json:"driver,omitempty"`
+	Library     string `json:"library,omitempty"`
 	TotalMemory uint64 `json:"total_memory,omitempty"`
 	FreeMemory  uint64 `json:"free_memory,omitempty"`
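
Note: the new Library field separates what was detected (Driver) from which server build should be loaded for it. Below is a minimal sketch of how a caller can dispatch on it, assuming the names introduced elsewhere in this diff (AvailableShims, newDynamicShimExtServer, newDefaultExtServer); the function name newLlmServer and the exact fallback logic are illustrative, not part of this patch:

func newLlmServer(info gpu.GpuInfo, model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
	// Prefer a dynamically loaded GPU shim when one was extracted for the
	// detected library, e.g. "cuda_server" or "rocm_server".
	if libPath, ok := AvailableShims[info.Library]; ok && info.Library != "default" {
		return newDynamicShimExtServer(libPath, model, adapters, projectors, numLayers, opts)
	}
	// Otherwise fall back to the CPU build statically linked into the binary.
	return newDefaultExtServer(model, adapters, projectors, numLayers, opts)
}
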
diff --git a/llm/rocm_shim.c b/llm/dynamic_shim.c
similarity index 55%
rename from llm/rocm_shim.c
rename to llm/dynamic_shim.c
index e8304aa0..8b5d67c9 100644
--- a/llm/rocm_shim.c
+++ b/llm/dynamic_shim.c
@@ -1,4 +1,4 @@
-#include "rocm_shim.h"
+#include "dynamic_shim.h"
 
 #include <stdio.h>
 #include <string.h>
@@ -28,8 +28,8 @@ inline static char *LOAD_ERR() {
 #define UNLOAD_LIBRARY(handle) dlclose(handle)
 #endif
 
-void rocm_shim_init(const char *libPath, struct rocm_llama_server *s,
-                    ext_server_resp_t *err) {
+void dynamic_shim_init(const char *libPath, struct dynamic_llama_server *s,
+                       ext_server_resp_t *err) {
   int i = 0;
   struct lookup {
     char *s;
@@ -57,11 +57,8 @@ void rocm_shim_init(const char *libPath, struct rocm_llama_server *s,
   s->handle = LOAD_LIBRARY(libPath, RTLD_NOW);
   if (!s->handle) {
     err->id = -1;
-    snprintf(
-        err->msg, err->msg_len,
-        "Unable to load rocm server library: %s (If you have a Radeon card, "
-        "did you install the ROCM libraries?)",
-        LOAD_ERR());
+    snprintf(err->msg, err->msg_len,
+             "Unable to load dynamic server library: %s", LOAD_ERR());
     return;
   }
 
@@ -77,64 +74,63 @@ void rocm_shim_init(const char *libPath, struct rocm_llama_server *s,
   }
 }
 
-inline void rocm_shim_llama_server_init(struct rocm_llama_server s,
-                                        ext_server_params_t *sparams,
-                                        ext_server_resp_t *err) {
+inline void dynamic_shim_llama_server_init(struct dynamic_llama_server s,
+                                           ext_server_params_t *sparams,
+                                           ext_server_resp_t *err) {
   s.llama_server_init(sparams, err);
 }
 
-inline void rocm_shim_llama_server_start(struct rocm_llama_server s) {
+inline void dynamic_shim_llama_server_start(struct dynamic_llama_server s) {
   s.llama_server_start();
 }
 
-inline void rocm_shim_llama_server_stop(struct rocm_llama_server s) {
+inline void dynamic_shim_llama_server_stop(struct dynamic_llama_server s) {
   s.llama_server_stop();
 }
 
-inline void rocm_shim_llama_server_completion(struct rocm_llama_server s,
-                                              const char *json_req,
-                                              ext_server_resp_t *resp) {
+inline void dynamic_shim_llama_server_completion(struct dynamic_llama_server s,
+                                                 const char *json_req,
+                                                 ext_server_resp_t *resp) {
   s.llama_server_completion(json_req, resp);
 }
 
-inline void rocm_shim_llama_server_completion_next_result(
-    struct rocm_llama_server s, const int task_id,
+inline void dynamic_shim_llama_server_completion_next_result(
+    struct dynamic_llama_server s, const int task_id,
     ext_server_task_result_t *result) {
   s.llama_server_completion_next_result(task_id, result);
 }
 
-inline void rocm_shim_llama_server_completion_cancel(struct rocm_llama_server s,
-                                                     const int task_id,
-                                                     ext_server_resp_t *err) {
+inline void dynamic_shim_llama_server_completion_cancel(
+    struct dynamic_llama_server s, const int task_id, ext_server_resp_t *err) {
   s.llama_server_completion_cancel(task_id, err);
 }
 
-inline void rocm_shim_llama_server_release_task_result(
-    struct rocm_llama_server s, ext_server_task_result_t *result) {
+inline void dynamic_shim_llama_server_release_task_result(
+    struct dynamic_llama_server s, ext_server_task_result_t *result) {
   s.llama_server_release_task_result(result);
 }
 
-inline void rocm_shim_llama_server_tokenize(struct rocm_llama_server s,
-                                            const char *json_req,
-                                            char **json_resp,
-                                            ext_server_resp_t *err) {
+inline void dynamic_shim_llama_server_tokenize(struct dynamic_llama_server s,
+                                               const char *json_req,
+                                               char **json_resp,
+                                               ext_server_resp_t *err) {
   s.llama_server_tokenize(json_req, json_resp, err);
 }
 
-inline void rocm_shim_llama_server_detokenize(struct rocm_llama_server s,
-                                              const char *json_req,
-                                              char **json_resp,
-                                              ext_server_resp_t *err) {
+inline void dynamic_shim_llama_server_detokenize(struct dynamic_llama_server s,
+                                                 const char *json_req,
+                                                 char **json_resp,
+                                                 ext_server_resp_t *err) {
   s.llama_server_detokenize(json_req, json_resp, err);
 }
 
-inline void rocm_shim_llama_server_embedding(struct rocm_llama_server s,
-                                             const char *json_req,
-                                             char **json_resp,
-                                             ext_server_resp_t *err) {
+inline void dynamic_shim_llama_server_embedding(struct dynamic_llama_server s,
+                                                const char *json_req,
+                                                char **json_resp,
+                                                ext_server_resp_t *err) {
   s.llama_server_embedding(json_req, json_resp, err);
 }
 
-inline void rocm_shim_llama_server_release_json_resp(struct rocm_llama_server s,
-                                                     char **json_resp) {
+inline void dynamic_shim_llama_server_release_json_resp(
+    struct dynamic_llama_server s, char **json_resp) {
  s.llama_server_release_json_resp(json_resp);
 }
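
Note: the shim above is a thin dlopen/dlsym wrapper. dynamic_shim_init fills a struct of function pointers from a table of symbol names, and the inline functions exist only to give Go something callable. A standalone sketch of the same resolution pattern, written with cgo purely for illustration (libm.so.6 and cos are convenient stand-ins for the server library and its exports):

package main

/*
#cgo LDFLAGS: -ldl
#include <dlfcn.h>
#include <stdlib.h>
*/
import "C"

import (
	"fmt"
	"unsafe"
)

func main() {
	// dlopen the target library eagerly, as the shim does with RTLD_NOW.
	lib := C.CString("libm.so.6")
	defer C.free(unsafe.Pointer(lib))
	handle := C.dlopen(lib, C.RTLD_NOW)
	if handle == nil {
		fmt.Println("dlopen failed:", C.GoString(C.dlerror()))
		return
	}
	defer C.dlclose(handle)

	// Resolve one symbol; the shim walks a table of {name, pointer-slot} pairs.
	sym := C.CString("cos")
	defer C.free(unsafe.Pointer(sym))
	p := C.dlsym(handle, sym)
	if p == nil {
		fmt.Println("dlsym failed:", C.GoString(C.dlerror()))
		return
	}
	fmt.Printf("resolved cos at %p\n", p)
}
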
diff --git a/llm/dynamic_shim.h b/llm/dynamic_shim.h
new file mode 100644
index 00000000..5e4e78b7
--- /dev/null
+++ b/llm/dynamic_shim.h
@@ -0,0 +1,74 @@
+#include <stdlib.h>
+
+#include "server.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+struct dynamic_llama_server {
+  void *handle;
+  void (*llama_server_init)(ext_server_params_t *sparams,
+                            ext_server_resp_t *err);
+  void (*llama_server_start)();
+  void (*llama_server_stop)();
+  void (*llama_server_completion)(const char *json_req,
+                                  ext_server_resp_t *resp);
+  void (*llama_server_completion_next_result)(const int task_id,
+                                              ext_server_task_result_t *result);
+  void (*llama_server_completion_cancel)(const int task_id,
+                                         ext_server_resp_t *err);
+  void (*llama_server_release_task_result)(ext_server_task_result_t *result);
+  void (*llama_server_tokenize)(const char *json_req, char **json_resp,
+                                ext_server_resp_t *err);
+  void (*llama_server_detokenize)(const char *json_req, char **json_resp,
+                                  ext_server_resp_t *err);
+  void (*llama_server_embedding)(const char *json_req, char **json_resp,
+                                 ext_server_resp_t *err);
+  void (*llama_server_release_json_resp)(char **json_resp);
+};
+
+void dynamic_shim_init(const char *libPath, struct dynamic_llama_server *s,
+                       ext_server_resp_t *err);
+
+// No good way to call C function pointers from Go so inline the indirection
+void dynamic_shim_llama_server_init(struct dynamic_llama_server s,
+                                    ext_server_params_t *sparams,
+                                    ext_server_resp_t *err);
+
+void dynamic_shim_llama_server_start(struct dynamic_llama_server s);
+
+void dynamic_shim_llama_server_stop(struct dynamic_llama_server s);
+
+void dynamic_shim_llama_server_completion(struct dynamic_llama_server s,
+                                          const char *json_req,
+                                          ext_server_resp_t *resp);
+
+void dynamic_shim_llama_server_completion_next_result(
+    struct dynamic_llama_server s, const int task_id,
+    ext_server_task_result_t *result);
+
+void dynamic_shim_llama_server_completion_cancel(struct dynamic_llama_server s,
+                                                 const int task_id,
+                                                 ext_server_resp_t *err);
+
+void dynamic_shim_llama_server_release_task_result(
+    struct dynamic_llama_server s, ext_server_task_result_t *result);
+
+void dynamic_shim_llama_server_tokenize(struct dynamic_llama_server s,
+                                        const char *json_req, char **json_resp,
+                                        ext_server_resp_t *err);
+
+void dynamic_shim_llama_server_detokenize(struct dynamic_llama_server s,
+                                          const char *json_req,
+                                          char **json_resp,
+                                          ext_server_resp_t *err);
+
+void dynamic_shim_llama_server_embedding(struct dynamic_llama_server s,
+                                         const char *json_req, char **json_resp,
+                                         ext_server_resp_t *err);
+void dynamic_shim_llama_server_release_json_resp(struct dynamic_llama_server s,
+                                                 char **json_resp);
+
+#ifdef __cplusplus
+}
+#endif
\ No newline at end of file
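
Note: the comment in this header is the crux of the design. cgo can pass C function pointers around as values, but it cannot call through one, which is why every field of dynamic_llama_server gets a named C wrapper. A minimal, self-contained illustration of that limitation and the trampoline idiom (add_impl/call_add are hypothetical names, not from this patch):

package main

/*
typedef int (*add_fn)(int, int);

static int add_impl(int a, int b) { return a + b; }
static add_fn add_ptr = add_impl;

// Go cannot write C.add_ptr(2, 3); it needs a C-side trampoline, exactly
// like the dynamic_shim_llama_server_* wrappers above.
static int call_add(add_fn f, int a, int b) { return f(a, b); }
*/
import "C"

import "fmt"

func main() {
	fmt.Println(C.call_add(C.add_ptr, 2, 3)) // prints 5
}
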
diff --git a/llm/ext_server.go b/llm/ext_server.go
index ab74eb00..048b1a65 100644
--- a/llm/ext_server.go
+++ b/llm/ext_server.go
@@ -17,7 +17,10 @@ package llm
 #cgo linux CFLAGS: -D_GNU_SOURCE
 #cgo linux windows CFLAGS: -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_USE_CUBLAS
 #cgo linux LDFLAGS: -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib/stubs
-#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cuda/libollama.a
+#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cpu/examples/server/libext_server.a
+#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cpu/common/libcommon.a
+#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cpu/libllama.a
+#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cpu/libggml_static.a
 #cgo linux LDFLAGS: -lrt -lpthread -ldl -lstdc++ -lm
 #cgo windows LDFLAGS: -L${SRCDIR}/llama.cpp/gguf/build/wincuda/dist/bin
 #cgo windows LDFLAGS: -lext_server_shared -lpthread
@@ -121,7 +124,7 @@ func (llm *llamaExtServer) llama_server_release_json_resp(json_resp **C.char) {
 	C.llama_server_release_json_resp(json_resp)
 }
 
-func newLlamaExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
+func newDefaultExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
 	server := &llamaExtServer{opts}
 	return newExtServer(server, model, adapters, projectors, numLayers, opts)
 }
diff --git a/llm/llama.cpp/gen_common.sh b/llm/llama.cpp/gen_common.sh
index 83a21cf9..c6b84f7d 100644
--- a/llm/llama.cpp/gen_common.sh
+++ b/llm/llama.cpp/gen_common.sh
@@ -6,7 +6,7 @@ init_vars() {
     CMAKE_DEFS="-DLLAMA_ACCELERATE=on"
     # TODO - LLAMA_K_QUANTS is stale and needs to be mapped to newer cmake settings
     CMAKE_TARGETS="--target ggml --target ggml_static --target llama --target build_info --target common --target ext_server --target llava_static"
-    if echo "${CGO_CFLAGS}" | grep -- '-g' > /dev/null ; then
+    if echo "${CGO_CFLAGS}" | grep -- '-g' >/dev/null; then
         CMAKE_DEFS="-DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_GPROF=on ${CMAKE_DEFS}"
     else
         # TODO - add additional optimization flags...
@@ -15,7 +15,7 @@ init_vars() {
 }
 
 git_module_setup() {
-    if [ -n "${OLLAMA_SKIP_PATCHING}" ] ; then
+    if [ -n "${OLLAMA_SKIP_PATCHING}" ]; then
         echo "Skipping submodule initialization"
         return
     fi
@@ -25,13 +25,13 @@ git_module_setup() {
 }
 
 apply_patches() {
-    if [ -n "${OLLAMA_SKIP_PATCHING}" ] ; then
+    if [ -n "${OLLAMA_SKIP_PATCHING}" ]; then
         echo "Skipping submodule patching"
         return
     fi
     # Workaround git apply not handling creation well for iteration
     rm -f gguf/examples/server/server.h
-    for patch in ${PATCHES} ; do
+    for patch in ${PATCHES}; do
         git -C gguf apply ../patches/${patch}
     done
 }
@@ -39,4 +39,4 @@
 build() {
     cmake -S ${LLAMACPP_DIR} -B ${BUILD_DIR} ${CMAKE_DEFS}
     cmake --build ${BUILD_DIR} ${CMAKE_TARGETS} -j8
-}
\ No newline at end of file
+}
diff --git a/llm/llama.cpp/gen_linux.sh b/llm/llama.cpp/gen_linux.sh
index 3608ddd6..e3cb87a8 100755
--- a/llm/llama.cpp/gen_linux.sh
+++ b/llm/llama.cpp/gen_linux.sh
@@ -1,81 +1,81 @@
 #!/bin/bash
 # This script is intended to run inside the go generate
-# working directory must be ../llm/llama.cpp
+# working directory must be llm/llama.cpp
+
+# First we build our default built-in library, which is linked into the CGO
+# binary as a normal dependency. This default build is CPU based.
+#
+# Then we build a CUDA dynamic library (statically linked against the CUDA
+# library dependencies for maximum portability).
+#
+# Then, if we detect ROCm, we build a dynamically loaded ROCm library. ROCm
+# must be a dynamic library even when it is the only GPU library detected:
+# we can't redistribute its object files, so the server has to rely on the
+# ROCm libraries being present at runtime, and would fail to start if they
+# were missing.
 
 set -ex
 set -o pipefail
 
 echo "Starting linux generate script"
-if [ -z "${CUDACXX}" -a -x /usr/local/cuda/bin/nvcc ] ; then
+if [ -z "${CUDACXX}" -a -x /usr/local/cuda/bin/nvcc ]; then
     export CUDACXX=/usr/local/cuda/bin/nvcc
 fi
+COMMON_CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_ACCELERATE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off"
+OLLAMA_DYN_LIB_DIR="gguf/build/lib"
+mkdir -p ${OLLAMA_DYN_LIB_DIR}
+touch ${OLLAMA_DYN_LIB_DIR}/.generated
 source $(dirname $0)/gen_common.sh
 init_vars
 git_module_setup
 apply_patches
-if [ -d /usr/local/cuda/lib64/ ] ; then
-    CMAKE_DEFS="-DLLAMA_CUBLAS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
-else
-    CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
-fi
-BUILD_DIR="gguf/build/cuda"
-LIB_DIR="${BUILD_DIR}/lib"
-mkdir -p ../../dist/
+
+#
+# CPU first for the default library
+#
+CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}"
+BUILD_DIR="gguf/build/cpu"
 build
-if [ -d /usr/local/cuda/lib64/ ] ; then
-    pwd
-    ar -M <<EOF
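
Note on the touch ${OLLAMA_DYN_LIB_DIR}/.generated line above: the Go side embeds this directory with //go:embed llama.cpp/gguf/build/lib/* (see shim_ext_server.go below), and a go:embed pattern that matches no files is a compile error. The placeholder presumably guarantees the package still compiles on machines where no dynamic GPU libraries are produced. A tiny demonstration of that failure mode, using a hypothetical assets directory:

package main

import (
	"embed"
	"fmt"
	"io/fs"
)

// This directive fails to compile with "pattern assets/*: no matching files
// found" if assets/ is empty or missing; an empty placeholder file fixes it.
//
//go:embed assets/*
var assets embed.FS

func main() {
	_ = fs.WalkDir(assets, ".", func(path string, d fs.DirEntry, err error) error {
		fmt.Println(path)
		return err
	})
}
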
diff --git a/llm/rocm_shim.h b/llm/rocm_shim.h
deleted file mode 100644
--- a/llm/rocm_shim.h
+++ /dev/null
@@ -1,73 +0,0 @@
-#include <stdlib.h>
-
-#include "server.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-struct rocm_llama_server {
-  void *handle;
-  void (*llama_server_init)(ext_server_params_t *sparams,
-                            ext_server_resp_t *err);
-  void (*llama_server_start)();
-  void (*llama_server_stop)();
-  void (*llama_server_completion)(const char *json_req,
-                                  ext_server_resp_t *resp);
-  void (*llama_server_completion_next_result)(const int task_id,
-                                              ext_server_task_result_t *result);
-  void (*llama_server_completion_cancel)(const int task_id,
-                                         ext_server_resp_t *err);
-  void (*llama_server_release_task_result)(ext_server_task_result_t *result);
-  void (*llama_server_tokenize)(const char *json_req, char **json_resp,
-                                ext_server_resp_t *err);
-  void (*llama_server_detokenize)(const char *json_req, char **json_resp,
-                                  ext_server_resp_t *err);
-  void (*llama_server_embedding)(const char *json_req, char **json_resp,
-                                 ext_server_resp_t *err);
-  void (*llama_server_release_json_resp)(char **json_resp);
-};
-
-void rocm_shim_init(const char *libPath, struct rocm_llama_server *s,
-                    ext_server_resp_t *err);
-
-// No good way to call C function pointers from Go so inline the indirection
-void rocm_shim_llama_server_init(struct rocm_llama_server s,
-                                 ext_server_params_t *sparams,
-                                 ext_server_resp_t *err);
-
-void rocm_shim_llama_server_start(struct rocm_llama_server s);
-
-void rocm_shim_llama_server_stop(struct rocm_llama_server s);
-
-void rocm_shim_llama_server_completion(struct rocm_llama_server s,
-                                       const char *json_req,
-                                       ext_server_resp_t *resp);
-
-void rocm_shim_llama_server_completion_next_result(
-    struct rocm_llama_server s, const int task_id,
-    ext_server_task_result_t *result);
-
-void rocm_shim_llama_server_completion_cancel(struct rocm_llama_server s,
-                                              const int task_id,
-                                              ext_server_resp_t *err);
-
-void rocm_shim_llama_server_release_task_result(
-    struct rocm_llama_server s, ext_server_task_result_t *result);
-
-void rocm_shim_llama_server_tokenize(struct rocm_llama_server s,
-                                     const char *json_req, char **json_resp,
-                                     ext_server_resp_t *err);
-
-void rocm_shim_llama_server_detokenize(struct rocm_llama_server s,
-                                       const char *json_req, char **json_resp,
-                                       ext_server_resp_t *err);
-
-void rocm_shim_llama_server_embedding(struct rocm_llama_server s,
-                                      const char *json_req, char **json_resp,
-                                      ext_server_resp_t *err);
-void rocm_shim_llama_server_release_json_resp(struct rocm_llama_server s,
-                                              char **json_resp);
-
-#ifdef __cplusplus
-}
-#endif
\ No newline at end of file
diff --git a/llm/shim_darwin.go b/llm/shim_darwin.go
index f63ce8c8..98e7a7d5 100644
--- a/llm/shim_darwin.go
+++ b/llm/shim_darwin.go
@@ -12,13 +12,13 @@ import (
 //go:embed llama.cpp/gguf/ggml-metal.metal
 var libEmbed embed.FS
 
-func newRocmShimExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
+func newDynamicShimExtServer(library, model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
 	// should never happen...
-	return nil, fmt.Errorf("ROCM GPUs not supported on Mac")
+	return nil, fmt.Errorf("Dynamic library loading not supported on Mac")
 }
 
 func nativeInit(workdir string) error {
-	err := extractLib(workdir, "llama.cpp/gguf/ggml-metal.metal")
+	_, err := extractDynamicLibs(workdir, "llama.cpp/gguf/ggml-metal.metal")
 	if err != nil {
 		if err == payloadMissing {
 			// TODO perhaps consider this a hard failure on arm macs?
diff --git a/llm/shim_ext_server.go b/llm/shim_ext_server.go
index fa841d49..d9c2df46 100644
--- a/llm/shim_ext_server.go
+++ b/llm/shim_ext_server.go
@@ -5,7 +5,7 @@ package llm
 
 /*
 #include <stdlib.h>
-#include "rocm_shim.h"
+#include "dynamic_shim.h"
 
 */
 import "C"
@@ -18,20 +18,20 @@ import (
 	"log"
 	"os"
 	"path/filepath"
-	"runtime"
+	"strings"
 	"sync"
 	"unsafe"
 
 	"github.com/jmorganca/ollama/api"
 )
 
-//go:embed llama.cpp/gguf/build/*/lib/*
+//go:embed llama.cpp/gguf/build/lib/*
 var libEmbed embed.FS
 
 var RocmShimMissing = fmt.Errorf("ROCm shim library not included in this build of ollama. Radeon GPUs are not supported")
 
 type shimExtServer struct {
-	s       C.struct_rocm_llama_server
+	s       C.struct_dynamic_llama_server
 	options api.Options
 }
 
@@ -40,50 +40,58 @@ var shimMutex sync.Mutex
 var llm *shimExtServer
 
 func (llm *shimExtServer) llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t) {
-	C.rocm_shim_llama_server_init(llm.s, sparams, err)
+	C.dynamic_shim_llama_server_init(llm.s, sparams, err)
 }
 func (llm *shimExtServer) llama_server_start() {
-	C.rocm_shim_llama_server_start(llm.s)
+	C.dynamic_shim_llama_server_start(llm.s)
 }
 func (llm *shimExtServer) llama_server_stop() {
-	C.rocm_shim_llama_server_stop(llm.s)
+	C.dynamic_shim_llama_server_stop(llm.s)
 }
 
 func (llm *shimExtServer) llama_server_completion(json_req *C.char, resp *C.ext_server_resp_t) {
-	C.rocm_shim_llama_server_completion(llm.s, json_req, resp)
+	C.dynamic_shim_llama_server_completion(llm.s, json_req, resp)
 }
 func (llm *shimExtServer) llama_server_completion_next_result(task_id C.int, resp *C.ext_server_task_result_t) {
-	C.rocm_shim_llama_server_completion_next_result(llm.s, task_id, resp)
+	C.dynamic_shim_llama_server_completion_next_result(llm.s, task_id, resp)
 }
 func (llm *shimExtServer) llama_server_completion_cancel(task_id C.int, err *C.ext_server_resp_t) {
-	C.rocm_shim_llama_server_completion_cancel(llm.s, task_id, err)
+	C.dynamic_shim_llama_server_completion_cancel(llm.s, task_id, err)
 }
 func (llm *shimExtServer) llama_server_release_task_result(result *C.ext_server_task_result_t) {
-	C.rocm_shim_llama_server_release_task_result(llm.s, result)
+	C.dynamic_shim_llama_server_release_task_result(llm.s, result)
 }
 
 func (llm *shimExtServer) llama_server_tokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
-	C.rocm_shim_llama_server_tokenize(llm.s, json_req, json_resp, err)
+	C.dynamic_shim_llama_server_tokenize(llm.s, json_req, json_resp, err)
 }
 func (llm *shimExtServer) llama_server_detokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
-	C.rocm_shim_llama_server_detokenize(llm.s, json_req, json_resp, err)
+	C.dynamic_shim_llama_server_detokenize(llm.s, json_req, json_resp, err)
 }
 func (llm *shimExtServer) llama_server_embedding(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
-	C.rocm_shim_llama_server_embedding(llm.s, json_req, json_resp, err)
+	C.dynamic_shim_llama_server_embedding(llm.s, json_req, json_resp, err)
 }
 func (llm *shimExtServer) llama_server_release_json_resp(json_resp **C.char) {
-	C.rocm_shim_llama_server_release_json_resp(llm.s, json_resp)
+	C.dynamic_shim_llama_server_release_json_resp(llm.s, json_resp)
 }
 
-func newRocmShimExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
-	if !ShimPresent {
-		return nil, RocmShimMissing
+func newDynamicShimExtServer(library, model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
+	shimMutex.Lock()
+	defer shimMutex.Unlock()
+	libPath := C.CString(library)
+	defer C.free(unsafe.Pointer(libPath))
+	resp := newExtServerResp(128)
+	defer freeExtServerResp(resp)
+	var srv C.struct_dynamic_llama_server
+	C.dynamic_shim_init(libPath, &srv, &resp)
+	if resp.id < 0 {
+		return nil, fmt.Errorf("Unable to load dynamic library: %s", C.GoString(resp.msg))
 	}
-	log.Printf("Loading ROCM llm server")
-	if llm == nil {
-		return nil, fmt.Errorf("nativeInit wasnt called or libary load failed")
+	llm = &shimExtServer{
+		s:       srv,
+		options: opts,
 	}
-	llm.options = opts
+	log.Printf("Loading Dynamic Shim llm server: %s", library)
 	return newExtServer(llm, model, adapters, projectors, numLayers, opts)
 }
@@ -108,64 +116,37 @@ func (llm *shimExtServer) Close() {
 }
 
 func nativeInit(workdir string) error {
-	err := extractLib(workdir, "llama.cpp/gguf/build/*/lib/*rocm_server*")
+	libs, err := extractDynamicLibs(workdir, "llama.cpp/gguf/build/lib/*server*")
 	if err != nil {
 		if err == payloadMissing {
-			log.Printf("%s", RocmShimMissing)
+			log.Printf("%s", payloadMissing)
 			return nil
 		}
 		return err
-	} else {
-		ShimPresent = true
+	}
+	for _, lib := range libs {
+		libName := strings.Split(strings.TrimPrefix(filepath.Base(lib), "lib"), ".")[0]
+		AvailableShims[libName] = lib
 	}
 
-	// Verify we have permissions - either running as root, or we have group access to the driver
-	fd, err := os.OpenFile("/dev/kfd", os.O_RDWR, 0666)
-	if err != nil {
-		if errors.Is(err, fs.ErrPermission) {
-			log.Fatalf("Radeon card detected, but permissions not set up properly. Either run ollama as root, or add you user account to the render group.")
-			return err
-		} else if errors.Is(err, fs.ErrNotExist) {
-			// expected behavior without a radeon card
-			return nil
+	// Only check ROCm access if we have the dynamic lib loaded
+	if _, rocmPresent := AvailableShims["rocm_server"]; rocmPresent {
+		// Verify we have permissions - either running as root, or we have group access to the driver
+		fd, err := os.OpenFile("/dev/kfd", os.O_RDWR, 0666)
+		if err != nil {
+			if errors.Is(err, fs.ErrPermission) {
+				log.Fatalf("Radeon card detected, but permissions not set up properly. Either run ollama as root, or add your user account to the render group.")
+				return err
+			} else if errors.Is(err, fs.ErrNotExist) {
+				// expected behavior without a radeon card
+				return nil
+			}
+
+			return fmt.Errorf("failed to check permission on /dev/kfd: %w", err)
 		}
+		fd.Close()
-		return fmt.Errorf("failed to check permission on /dev/kfd: %w", err)
-	}
-	fd.Close()
-
-	shimMutex.Lock()
-	defer shimMutex.Unlock()
-	if llm != nil {
-		return nil
-	}
-	var libName string
-	switch runtime.GOOS {
-	case "darwin":
-		// shouldn't happen
-		return nil
-	case "linux":
-		libName = "librocm_server.so"
-	case "windows":
-		libName = "rocm_server.dll"
-	default:
-		// shouldn't happen
-		return nil
-	}
-	libPath := C.CString(filepath.Join(workdir, libName))
-	defer C.free(unsafe.Pointer(libPath))
-	resp := newExtServerResp(128)
-	defer freeExtServerResp(resp)
-	var srv C.struct_rocm_llama_server
-	C.rocm_shim_init(libPath, &srv, &resp)
-	if resp.id < 0 {
-		// TODO - consider softening this failure mode to allow fall-back to the CUDA based built-in llm
-		// and run against CPU
-		return fmt.Errorf("Unable to load AMD GPU library: %s", C.GoString(resp.msg))
-	}
-	llm = &shimExtServer{
-		s:       srv,
-		options: api.DefaultOptions(),
-	}
 	return nil
 }
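
Note: the key derivation in nativeInit is worth spelling out. AvailableShims is keyed by the library file's base name with its lib prefix and extension stripped, which matches what gpu.GetGPUInfo now reports in the Library field. A small runnable check of that transformation (the paths are examples):

package main

import (
	"fmt"
	"path/filepath"
	"strings"
)

// shimName mirrors the expression used in nativeInit above.
func shimName(lib string) string {
	return strings.Split(strings.TrimPrefix(filepath.Base(lib), "lib"), ".")[0]
}

func main() {
	fmt.Println(shimName("/tmp/ollama/llama.cpp/gguf/build/lib/librocm_server.so")) // rocm_server
	fmt.Println(shimName("/tmp/ollama/llama.cpp/gguf/build/lib/libcuda_server.so")) // cuda_server
}
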