diff --git a/Dockerfile.build b/Dockerfile.build
index 5499b0a1..6b7e3c4d 100644
--- a/Dockerfile.build
+++ b/Dockerfile.build
@@ -3,7 +3,7 @@ FROM --platform=linux/amd64 ubuntu:20.04 AS base-amd64
 ARG CUDA_VERSION=11.3.1-1
 ARG CMAKE_VERSION=3.22.1
 # ROCm only supports amd64
-ARG ROCM_VERSION=5.7
+ARG ROCM_VERSION=6.0
 # Note: https://rocm.docs.amd.com/en/latest/release/user_kernel_space_compat_matrix.html
 
 RUN apt-get update && \
diff --git a/gpu/gpu.go b/gpu/gpu.go
index d03812c8..91ced3a8 100644
--- a/gpu/gpu.go
+++ b/gpu/gpu.go
@@ -65,7 +65,7 @@ func GetGPUInfo() GpuInfo {
 	}
 
 	var memInfo C.mem_info_t
-	resp := GpuInfo{"", 0, 0}
+	resp := GpuInfo{"", "", 0, 0}
 	if gpuHandles.cuda != nil {
 		C.cuda_check_vram(*gpuHandles.cuda, &memInfo)
 		if memInfo.err != nil {
@@ -73,6 +73,7 @@ func GetGPUInfo() GpuInfo {
 			C.free(unsafe.Pointer(memInfo.err))
 		} else {
 			resp.Driver = "CUDA"
+			resp.Library = "cuda_server"
 		}
 	} else if gpuHandles.rocm != nil {
 		C.rocm_check_vram(*gpuHandles.rocm, &memInfo)
@@ -81,11 +82,14 @@ func GetGPUInfo() GpuInfo {
 			C.free(unsafe.Pointer(memInfo.err))
 		} else {
 			resp.Driver = "ROCM"
+			resp.Library = "rocm_server"
 		}
 	}
 	if resp.Driver == "" {
 		C.cpu_check_ram(&memInfo)
 		resp.Driver = "CPU"
+		// In the future we may offer multiple CPU variants to tune CPU features
+		resp.Library = "default"
 	}
 	if memInfo.err != nil {
 		log.Printf("error looking up CPU memory: %s", C.GoString(memInfo.err))
diff --git a/gpu/gpu_darwin.go b/gpu/gpu_darwin.go
index 14bd2655..ccf67b51 100644
--- a/gpu/gpu_darwin.go
+++ b/gpu/gpu_darwin.go
@@ -21,6 +21,7 @@ func GetGPUInfo() GpuInfo {
 	return GpuInfo{
 		Driver:      "METAL",
+		Library:     "default",
 		TotalMemory: 0,
 		FreeMemory:  0,
 	}
diff --git a/gpu/types.go b/gpu/types.go
index a84a0a8d..a56da45e 100644
--- a/gpu/types.go
+++ b/gpu/types.go
@@ -3,6 +3,7 @@ package gpu
 
 // Beginning of an `ollama info` command
 type GpuInfo struct {
 	Driver      string `json:"driver,omitempty"`
+	Library     string `json:"library,omitempty"`
 	TotalMemory uint64 `json:"total_memory,omitempty"`
 	FreeMemory  uint64 `json:"free_memory,omitempty"`
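
Note: the new Library field separates what was detected (Driver) from which server build should be loaded for it. Below is a minimal sketch of how a caller can dispatch on it, assuming the names introduced elsewhere in this diff (AvailableShims, newDynamicShimExtServer, newDefaultExtServer); the function name newLlmServer and the exact fallback logic are illustrative, not part of this patch:

func newLlmServer(info gpu.GpuInfo, model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
	// Prefer a dynamically loaded GPU shim when one was extracted for the
	// detected library, e.g. "cuda_server" or "rocm_server".
	if libPath, ok := AvailableShims[info.Library]; ok && info.Library != "default" {
		return newDynamicShimExtServer(libPath, model, adapters, projectors, numLayers, opts)
	}
	// Otherwise fall back to the CPU build statically linked into the binary.
	return newDefaultExtServer(model, adapters, projectors, numLayers, opts)
}
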
diff --git a/llm/rocm_shim.c b/llm/dynamic_shim.c
similarity index 55%
rename from llm/rocm_shim.c
rename to llm/dynamic_shim.c
index e8304aa0..8b5d67c9 100644
--- a/llm/rocm_shim.c
+++ b/llm/dynamic_shim.c
@@ -1,4 +1,4 @@
-#include "rocm_shim.h"
+#include "dynamic_shim.h"
 
 #include <stdio.h>
 #include <string.h>
@@ -28,8 +28,8 @@ inline static char *LOAD_ERR() {
 #define UNLOAD_LIBRARY(handle) dlclose(handle)
 #endif
 
-void rocm_shim_init(const char *libPath, struct rocm_llama_server *s,
-                    ext_server_resp_t *err) {
+void dynamic_shim_init(const char *libPath, struct dynamic_llama_server *s,
+                       ext_server_resp_t *err) {
   int i = 0;
   struct lookup {
     char *s;
@@ -57,11 +57,8 @@ void rocm_shim_init(const char *libPath, struct rocm_llama_server *s,
   s->handle = LOAD_LIBRARY(libPath, RTLD_NOW);
   if (!s->handle) {
     err->id = -1;
-    snprintf(
-        err->msg, err->msg_len,
-        "Unable to load rocm server library: %s (If you have a Radeon card, "
-        "did you install the ROCM libraries?)",
-        LOAD_ERR());
+    snprintf(err->msg, err->msg_len,
+             "Unable to load dynamic server library: %s", LOAD_ERR());
     return;
   }
 
@@ -77,64 +74,63 @@ void rocm_shim_init(const char *libPath, struct rocm_llama_server *s,
   }
 }
 
-inline void rocm_shim_llama_server_init(struct rocm_llama_server s,
-                                        ext_server_params_t *sparams,
-                                        ext_server_resp_t *err) {
+inline void dynamic_shim_llama_server_init(struct dynamic_llama_server s,
+                                           ext_server_params_t *sparams,
+                                           ext_server_resp_t *err) {
   s.llama_server_init(sparams, err);
 }
 
-inline void rocm_shim_llama_server_start(struct rocm_llama_server s) {
+inline void dynamic_shim_llama_server_start(struct dynamic_llama_server s) {
   s.llama_server_start();
 }
 
-inline void rocm_shim_llama_server_stop(struct rocm_llama_server s) {
+inline void dynamic_shim_llama_server_stop(struct dynamic_llama_server s) {
   s.llama_server_stop();
 }
 
-inline void rocm_shim_llama_server_completion(struct rocm_llama_server s,
-                                              const char *json_req,
-                                              ext_server_resp_t *resp) {
+inline void dynamic_shim_llama_server_completion(struct dynamic_llama_server s,
+                                                 const char *json_req,
+                                                 ext_server_resp_t *resp) {
   s.llama_server_completion(json_req, resp);
 }
 
-inline void rocm_shim_llama_server_completion_next_result(
-    struct rocm_llama_server s, const int task_id,
+inline void dynamic_shim_llama_server_completion_next_result(
+    struct dynamic_llama_server s, const int task_id,
     ext_server_task_result_t *result) {
   s.llama_server_completion_next_result(task_id, result);
 }
 
-inline void rocm_shim_llama_server_completion_cancel(struct rocm_llama_server s,
-                                                     const int task_id,
-                                                     ext_server_resp_t *err) {
+inline void dynamic_shim_llama_server_completion_cancel(
+    struct dynamic_llama_server s, const int task_id, ext_server_resp_t *err) {
   s.llama_server_completion_cancel(task_id, err);
 }
 
-inline void rocm_shim_llama_server_release_task_result(
-    struct rocm_llama_server s, ext_server_task_result_t *result) {
+inline void dynamic_shim_llama_server_release_task_result(
+    struct dynamic_llama_server s, ext_server_task_result_t *result) {
   s.llama_server_release_task_result(result);
 }
 
-inline void rocm_shim_llama_server_tokenize(struct rocm_llama_server s,
-                                            const char *json_req,
-                                            char **json_resp,
-                                            ext_server_resp_t *err) {
+inline void dynamic_shim_llama_server_tokenize(struct dynamic_llama_server s,
+                                               const char *json_req,
+                                               char **json_resp,
+                                               ext_server_resp_t *err) {
   s.llama_server_tokenize(json_req, json_resp, err);
 }
 
-inline void rocm_shim_llama_server_detokenize(struct rocm_llama_server s,
-                                              const char *json_req,
-                                              char **json_resp,
-                                              ext_server_resp_t *err) {
+inline void dynamic_shim_llama_server_detokenize(struct dynamic_llama_server s,
+                                                 const char *json_req,
+                                                 char **json_resp,
+                                                 ext_server_resp_t *err) {
   s.llama_server_detokenize(json_req, json_resp, err);
 }
 
-inline void rocm_shim_llama_server_embedding(struct rocm_llama_server s,
-                                             const char *json_req,
-                                             char **json_resp,
-                                             ext_server_resp_t *err) {
+inline void dynamic_shim_llama_server_embedding(struct dynamic_llama_server s,
+                                                const char *json_req,
+                                                char **json_resp,
+                                                ext_server_resp_t *err) {
   s.llama_server_embedding(json_req, json_resp, err);
 }
 
-inline void rocm_shim_llama_server_release_json_resp(struct rocm_llama_server s,
-                                                     char **json_resp) {
+inline void dynamic_shim_llama_server_release_json_resp(
+    struct dynamic_llama_server s, char **json_resp) {
  s.llama_server_release_json_resp(json_resp);
 }
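
Note: the shim above is a thin dlopen/dlsym wrapper. dynamic_shim_init fills a struct of function pointers from a table of symbol names, and the inline functions exist only to give Go something callable. A standalone sketch of the same resolution pattern, written with cgo purely for illustration (libm.so.6 and cos are convenient stand-ins for the server library and its exports):

package main

/*
#cgo LDFLAGS: -ldl
#include <dlfcn.h>
#include <stdlib.h>
*/
import "C"

import (
	"fmt"
	"unsafe"
)

func main() {
	// dlopen the target library eagerly, as the shim does with RTLD_NOW.
	lib := C.CString("libm.so.6")
	defer C.free(unsafe.Pointer(lib))
	handle := C.dlopen(lib, C.RTLD_NOW)
	if handle == nil {
		fmt.Println("dlopen failed:", C.GoString(C.dlerror()))
		return
	}
	defer C.dlclose(handle)

	// Resolve one symbol; the shim walks a table of {name, pointer-slot} pairs.
	sym := C.CString("cos")
	defer C.free(unsafe.Pointer(sym))
	p := C.dlsym(handle, sym)
	if p == nil {
		fmt.Println("dlsym failed:", C.GoString(C.dlerror()))
		return
	}
	fmt.Printf("resolved cos at %p\n", p)
}
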
diff --git a/llm/dynamic_shim.h b/llm/dynamic_shim.h
new file mode 100644
index 00000000..5e4e78b7
--- /dev/null
+++ b/llm/dynamic_shim.h
@@ -0,0 +1,74 @@
+#include <stdlib.h>
+
+#include "server.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+struct dynamic_llama_server {
+  void *handle;
+  void (*llama_server_init)(ext_server_params_t *sparams,
+                            ext_server_resp_t *err);
+  void (*llama_server_start)();
+  void (*llama_server_stop)();
+  void (*llama_server_completion)(const char *json_req,
+                                  ext_server_resp_t *resp);
+  void (*llama_server_completion_next_result)(const int task_id,
+                                              ext_server_task_result_t *result);
+  void (*llama_server_completion_cancel)(const int task_id,
+                                         ext_server_resp_t *err);
+  void (*llama_server_release_task_result)(ext_server_task_result_t *result);
+  void (*llama_server_tokenize)(const char *json_req, char **json_resp,
+                                ext_server_resp_t *err);
+  void (*llama_server_detokenize)(const char *json_req, char **json_resp,
+                                  ext_server_resp_t *err);
+  void (*llama_server_embedding)(const char *json_req, char **json_resp,
+                                 ext_server_resp_t *err);
+  void (*llama_server_release_json_resp)(char **json_resp);
+};
+
+void dynamic_shim_init(const char *libPath, struct dynamic_llama_server *s,
+                       ext_server_resp_t *err);
+
+// No good way to call C function pointers from Go so inline the indirection
+void dynamic_shim_llama_server_init(struct dynamic_llama_server s,
+                                    ext_server_params_t *sparams,
+                                    ext_server_resp_t *err);
+
+void dynamic_shim_llama_server_start(struct dynamic_llama_server s);
+
+void dynamic_shim_llama_server_stop(struct dynamic_llama_server s);
+
+void dynamic_shim_llama_server_completion(struct dynamic_llama_server s,
+                                          const char *json_req,
+                                          ext_server_resp_t *resp);
+
+void dynamic_shim_llama_server_completion_next_result(
+    struct dynamic_llama_server s, const int task_id,
+    ext_server_task_result_t *result);
+
+void dynamic_shim_llama_server_completion_cancel(struct dynamic_llama_server s,
+                                                 const int task_id,
+                                                 ext_server_resp_t *err);
+
+void dynamic_shim_llama_server_release_task_result(
+    struct dynamic_llama_server s, ext_server_task_result_t *result);
+
+void dynamic_shim_llama_server_tokenize(struct dynamic_llama_server s,
+                                        const char *json_req, char **json_resp,
+                                        ext_server_resp_t *err);
+
+void dynamic_shim_llama_server_detokenize(struct dynamic_llama_server s,
+                                          const char *json_req,
+                                          char **json_resp,
+                                          ext_server_resp_t *err);
+
+void dynamic_shim_llama_server_embedding(struct dynamic_llama_server s,
+                                         const char *json_req, char **json_resp,
+                                         ext_server_resp_t *err);
+void dynamic_shim_llama_server_release_json_resp(struct dynamic_llama_server s,
+                                                 char **json_resp);
+
+#ifdef __cplusplus
+}
+#endif
\ No newline at end of file
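
Note: the comment in this header is the crux of the design. cgo can pass C function pointers around as values, but it cannot call through one, which is why every field of dynamic_llama_server gets a named C wrapper. A minimal, self-contained illustration of that limitation and the trampoline idiom (add_impl/call_add are hypothetical names, not from this patch):

package main

/*
typedef int (*add_fn)(int, int);

static int add_impl(int a, int b) { return a + b; }
static add_fn add_ptr = add_impl;

// Go cannot write C.add_ptr(2, 3); it needs a C-side trampoline, exactly
// like the dynamic_shim_llama_server_* wrappers above.
static int call_add(add_fn f, int a, int b) { return f(a, b); }
*/
import "C"

import "fmt"

func main() {
	fmt.Println(C.call_add(C.add_ptr, 2, 3)) // prints 5
}
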
diff --git a/llm/ext_server.go b/llm/ext_server.go
index ab74eb00..048b1a65 100644
--- a/llm/ext_server.go
+++ b/llm/ext_server.go
@@ -17,7 +17,10 @@ package llm
 #cgo linux CFLAGS: -D_GNU_SOURCE
 #cgo linux windows CFLAGS: -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_USE_CUBLAS
 #cgo linux LDFLAGS: -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib/stubs
-#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cuda/libollama.a
+#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cpu/examples/server/libext_server.a
+#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cpu/common/libcommon.a
+#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cpu/libllama.a
+#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cpu/libggml_static.a
 #cgo linux LDFLAGS: -lrt -lpthread -ldl -lstdc++ -lm
 #cgo windows LDFLAGS: -L${SRCDIR}/llama.cpp/gguf/build/wincuda/dist/bin
 #cgo windows LDFLAGS: -lext_server_shared -lpthread
@@ -121,7 +124,7 @@ func (llm *llamaExtServer) llama_server_release_json_resp(json_resp **C.char) {
 	C.llama_server_release_json_resp(json_resp)
 }
 
-func newLlamaExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
+func newDefaultExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
 	server := &llamaExtServer{opts}
 	return newExtServer(server, model, adapters, projectors, numLayers, opts)
 }
diff --git a/llm/llama.cpp/gen_common.sh b/llm/llama.cpp/gen_common.sh
index 83a21cf9..c6b84f7d 100644
--- a/llm/llama.cpp/gen_common.sh
+++ b/llm/llama.cpp/gen_common.sh
@@ -6,7 +6,7 @@ init_vars() {
     CMAKE_DEFS="-DLLAMA_ACCELERATE=on"
     # TODO - LLAMA_K_QUANTS is stale and needs to be mapped to newer cmake settings
     CMAKE_TARGETS="--target ggml --target ggml_static --target llama --target build_info --target common --target ext_server --target llava_static"
-    if echo "${CGO_CFLAGS}" | grep -- '-g' > /dev/null ; then
+    if echo "${CGO_CFLAGS}" | grep -- '-g' >/dev/null; then
         CMAKE_DEFS="-DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_GPROF=on ${CMAKE_DEFS}"
     else
         # TODO - add additional optimization flags...
@@ -15,7 +15,7 @@ init_vars() {
 }
 
 git_module_setup() {
-    if [ -n "${OLLAMA_SKIP_PATCHING}" ] ; then
+    if [ -n "${OLLAMA_SKIP_PATCHING}" ]; then
         echo "Skipping submodule initialization"
         return
     fi
@@ -25,13 +25,13 @@ git_module_setup() {
 }
 
 apply_patches() {
-    if [ -n "${OLLAMA_SKIP_PATCHING}" ] ; then
+    if [ -n "${OLLAMA_SKIP_PATCHING}" ]; then
         echo "Skipping submodule patching"
         return
     fi
     # Workaround git apply not handling creation well for iteration
     rm -f gguf/examples/server/server.h
-    for patch in ${PATCHES} ; do
+    for patch in ${PATCHES}; do
         git -C gguf apply ../patches/${patch}
     done
 }
@@ -39,4 +39,4 @@
 build() {
     cmake -S ${LLAMACPP_DIR} -B ${BUILD_DIR} ${CMAKE_DEFS}
     cmake --build ${BUILD_DIR} ${CMAKE_TARGETS} -j8
-}
\ No newline at end of file
+}
diff --git a/llm/llama.cpp/gen_linux.sh b/llm/llama.cpp/gen_linux.sh
index 3608ddd6..e3cb87a8 100755
--- a/llm/llama.cpp/gen_linux.sh
+++ b/llm/llama.cpp/gen_linux.sh
@@ -1,81 +1,81 @@
 #!/bin/bash
 # This script is intended to run inside the go generate
-# working directory must be ../llm/llama.cpp
+# working directory must be llm/llama.cpp
+
+# First we build our default built-in library, which is linked into the CGO
+# binary as a normal dependency. This default build is CPU based.
+#
+# Then we build a CUDA dynamic library (statically linked against the CUDA
+# library dependencies for maximum portability).
+#
+# Then, if we detect ROCm, we build a dynamically loaded ROCm library. ROCm
+# must be a dynamic library even when it is the only GPU library detected:
+# we can't redistribute its object files, so the server has to rely on the
+# ROCm libraries being present at runtime, and would fail to start if they
+# were missing.
 
 set -ex
 set -o pipefail
 
 echo "Starting linux generate script"
-if [ -z "${CUDACXX}" -a -x /usr/local/cuda/bin/nvcc ] ; then
+if [ -z "${CUDACXX}" -a -x /usr/local/cuda/bin/nvcc ]; then
     export CUDACXX=/usr/local/cuda/bin/nvcc
 fi
+COMMON_CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_ACCELERATE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off"
+OLLAMA_DYN_LIB_DIR="gguf/build/lib"
+mkdir -p ${OLLAMA_DYN_LIB_DIR}
+touch ${OLLAMA_DYN_LIB_DIR}/.generated
 source $(dirname $0)/gen_common.sh
 init_vars
 git_module_setup
 apply_patches
-if [ -d /usr/local/cuda/lib64/ ] ; then
-    CMAKE_DEFS="-DLLAMA_CUBLAS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
-else
-    CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
-fi
-BUILD_DIR="gguf/build/cuda"
-LIB_DIR="${BUILD_DIR}/lib"
-mkdir -p ../../dist/
+
+#
+# CPU first for the default library
+#
+CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}"
+BUILD_DIR="gguf/build/cpu"
 build
-if [ -d /usr/local/cuda/lib64/ ] ; then
-    pwd
-    ar -M <<EOF
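
Note on the touch ${OLLAMA_DYN_LIB_DIR}/.generated line above: the Go side embeds this directory with //go:embed llama.cpp/gguf/build/lib/* (see shim_ext_server.go below), and a go:embed pattern that matches no files is a compile error. The placeholder presumably guarantees the package still compiles on machines where no dynamic GPU libraries are produced. A tiny demonstration of that failure mode, using a hypothetical assets directory:

package main

import (
	"embed"
	"fmt"
	"io/fs"
)

// This directive fails to compile with "pattern assets/*: no matching files
// found" if assets/ is empty or missing; an empty placeholder file fixes it.
//
//go:embed assets/*
var assets embed.FS

func main() {
	_ = fs.WalkDir(assets, ".", func(path string, d fs.DirEntry, err error) error {
		fmt.Println(path)
		return err
	})
}
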
diff --git a/llm/rocm_shim.h b/llm/rocm_shim.h
deleted file mode 100644
--- a/llm/rocm_shim.h
+++ /dev/null
@@ -1,73 +0,0 @@
-#include <stdlib.h>
-
-#include "server.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-struct rocm_llama_server {
-  void *handle;
-  void (*llama_server_init)(ext_server_params_t *sparams,
-                            ext_server_resp_t *err);
-  void (*llama_server_start)();
-  void (*llama_server_stop)();
-  void (*llama_server_completion)(const char *json_req,
-                                  ext_server_resp_t *resp);
-  void (*llama_server_completion_next_result)(const int task_id,
-                                              ext_server_task_result_t *result);
-  void (*llama_server_completion_cancel)(const int task_id,
-                                         ext_server_resp_t *err);
-  void (*llama_server_release_task_result)(ext_server_task_result_t *result);
-  void (*llama_server_tokenize)(const char *json_req, char **json_resp,
-                                ext_server_resp_t *err);
-  void (*llama_server_detokenize)(const char *json_req, char **json_resp,
-                                  ext_server_resp_t *err);
-  void (*llama_server_embedding)(const char *json_req, char **json_resp,
-                                 ext_server_resp_t *err);
-  void (*llama_server_release_json_resp)(char **json_resp);
-};
-
-void rocm_shim_init(const char *libPath, struct rocm_llama_server *s,
-                    ext_server_resp_t *err);
-
-// No good way to call C function pointers from Go so inline the indirection
-void rocm_shim_llama_server_init(struct rocm_llama_server s,
-                                 ext_server_params_t *sparams,
-                                 ext_server_resp_t *err);
-
-void rocm_shim_llama_server_start(struct rocm_llama_server s);
-
-void rocm_shim_llama_server_stop(struct rocm_llama_server s);
-
-void rocm_shim_llama_server_completion(struct rocm_llama_server s,
-                                       const char *json_req,
-                                       ext_server_resp_t *resp);
-
-void rocm_shim_llama_server_completion_next_result(
-    struct rocm_llama_server s, const int task_id,
-    ext_server_task_result_t *result);
-
-void rocm_shim_llama_server_completion_cancel(struct rocm_llama_server s,
-                                              const int task_id,
-                                              ext_server_resp_t *err);
-
-void rocm_shim_llama_server_release_task_result(
-    struct rocm_llama_server s, ext_server_task_result_t *result);
-
-void rocm_shim_llama_server_tokenize(struct rocm_llama_server s,
-                                     const char *json_req, char **json_resp,
-                                     ext_server_resp_t *err);
-
-void rocm_shim_llama_server_detokenize(struct rocm_llama_server s,
-                                       const char *json_req, char **json_resp,
-                                       ext_server_resp_t *err);
-
-void rocm_shim_llama_server_embedding(struct rocm_llama_server s,
-                                      const char *json_req, char **json_resp,
-                                      ext_server_resp_t *err);
-void rocm_shim_llama_server_release_json_resp(struct rocm_llama_server s,
-                                              char **json_resp);
-
-#ifdef __cplusplus
-}
-#endif
\ No newline at end of file
diff --git a/llm/shim_darwin.go b/llm/shim_darwin.go
index f63ce8c8..98e7a7d5 100644
--- a/llm/shim_darwin.go
+++ b/llm/shim_darwin.go
@@ -12,13 +12,13 @@ import (
 //go:embed llama.cpp/gguf/ggml-metal.metal
 var libEmbed embed.FS
 
-func newRocmShimExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
+func newDynamicShimExtServer(library, model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
 	// should never happen...
-	return nil, fmt.Errorf("ROCM GPUs not supported on Mac")
+	return nil, fmt.Errorf("Dynamic library loading not supported on Mac")
 }
 
 func nativeInit(workdir string) error {
-	err := extractLib(workdir, "llama.cpp/gguf/ggml-metal.metal")
+	_, err := extractDynamicLibs(workdir, "llama.cpp/gguf/ggml-metal.metal")
 	if err != nil {
 		if err == payloadMissing {
 			// TODO perhaps consider this a hard failure on arm macs?
diff --git a/llm/shim_ext_server.go b/llm/shim_ext_server.go
index fa841d49..d9c2df46 100644
--- a/llm/shim_ext_server.go
+++ b/llm/shim_ext_server.go
@@ -5,7 +5,7 @@ package llm
 
 /*
 #include <stdlib.h>
-#include "rocm_shim.h"
+#include "dynamic_shim.h"
 
 */
 import "C"
@@ -18,20 +18,20 @@ import (
 	"log"
 	"os"
 	"path/filepath"
-	"runtime"
+	"strings"
 	"sync"
 	"unsafe"
 
 	"github.com/jmorganca/ollama/api"
 )
 
-//go:embed llama.cpp/gguf/build/*/lib/*
+//go:embed llama.cpp/gguf/build/lib/*
 var libEmbed embed.FS
 
 var RocmShimMissing = fmt.Errorf("ROCm shim library not included in this build of ollama. Radeon GPUs are not supported")
 
 type shimExtServer struct {
-	s       C.struct_rocm_llama_server
+	s       C.struct_dynamic_llama_server
 	options api.Options
 }
 
@@ -40,50 +40,58 @@ var shimMutex sync.Mutex
 var llm *shimExtServer
 
 func (llm *shimExtServer) llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t) {
-	C.rocm_shim_llama_server_init(llm.s, sparams, err)
+	C.dynamic_shim_llama_server_init(llm.s, sparams, err)
 }
 func (llm *shimExtServer) llama_server_start() {
-	C.rocm_shim_llama_server_start(llm.s)
+	C.dynamic_shim_llama_server_start(llm.s)
 }
 func (llm *shimExtServer) llama_server_stop() {
-	C.rocm_shim_llama_server_stop(llm.s)
+	C.dynamic_shim_llama_server_stop(llm.s)
 }
 
 func (llm *shimExtServer) llama_server_completion(json_req *C.char, resp *C.ext_server_resp_t) {
-	C.rocm_shim_llama_server_completion(llm.s, json_req, resp)
+	C.dynamic_shim_llama_server_completion(llm.s, json_req, resp)
 }
 func (llm *shimExtServer) llama_server_completion_next_result(task_id C.int, resp *C.ext_server_task_result_t) {
-	C.rocm_shim_llama_server_completion_next_result(llm.s, task_id, resp)
+	C.dynamic_shim_llama_server_completion_next_result(llm.s, task_id, resp)
 }
 func (llm *shimExtServer) llama_server_completion_cancel(task_id C.int, err *C.ext_server_resp_t) {
-	C.rocm_shim_llama_server_completion_cancel(llm.s, task_id, err)
+	C.dynamic_shim_llama_server_completion_cancel(llm.s, task_id, err)
 }
 func (llm *shimExtServer) llama_server_release_task_result(result *C.ext_server_task_result_t) {
-	C.rocm_shim_llama_server_release_task_result(llm.s, result)
+	C.dynamic_shim_llama_server_release_task_result(llm.s, result)
 }
 
 func (llm *shimExtServer) llama_server_tokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
-	C.rocm_shim_llama_server_tokenize(llm.s, json_req, json_resp, err)
+	C.dynamic_shim_llama_server_tokenize(llm.s, json_req, json_resp, err)
 }
 func (llm *shimExtServer) llama_server_detokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
-	C.rocm_shim_llama_server_detokenize(llm.s, json_req, json_resp, err)
+	C.dynamic_shim_llama_server_detokenize(llm.s, json_req, json_resp, err)
 }
 func (llm *shimExtServer) llama_server_embedding(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
-	C.rocm_shim_llama_server_embedding(llm.s, json_req, json_resp, err)
+	C.dynamic_shim_llama_server_embedding(llm.s, json_req, json_resp, err)
 }
 func (llm *shimExtServer) llama_server_release_json_resp(json_resp **C.char) {
-	C.rocm_shim_llama_server_release_json_resp(llm.s, json_resp)
+	C.dynamic_shim_llama_server_release_json_resp(llm.s, json_resp)
 }
 
-func newRocmShimExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
-	if !ShimPresent {
-		return nil, RocmShimMissing
+func newDynamicShimExtServer(library, model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
+	shimMutex.Lock()
+	defer shimMutex.Unlock()
+	libPath := C.CString(library)
+	defer C.free(unsafe.Pointer(libPath))
+	resp := newExtServerResp(128)
+	defer freeExtServerResp(resp)
+	var srv C.struct_dynamic_llama_server
+	C.dynamic_shim_init(libPath, &srv, &resp)
+	if resp.id < 0 {
+		return nil, fmt.Errorf("Unable to load dynamic library: %s", C.GoString(resp.msg))
 	}
-	log.Printf("Loading ROCM llm server")
-	if llm == nil {
-		return nil, fmt.Errorf("nativeInit wasnt called or libary load failed")
+	llm = &shimExtServer{
+		s:       srv,
+		options: opts,
 	}
-	llm.options = opts
+	log.Printf("Loading Dynamic Shim llm server: %s", library)
 	return newExtServer(llm, model, adapters, projectors, numLayers, opts)
 }
@@ -108,64 +116,37 @@ func (llm *shimExtServer) Close() {
 }
 
 func nativeInit(workdir string) error {
-	err := extractLib(workdir, "llama.cpp/gguf/build/*/lib/*rocm_server*")
+	libs, err := extractDynamicLibs(workdir, "llama.cpp/gguf/build/lib/*server*")
 	if err != nil {
 		if err == payloadMissing {
-			log.Printf("%s", RocmShimMissing)
+			log.Printf("%s", payloadMissing)
 			return nil
 		}
 		return err
-	} else {
-		ShimPresent = true
+	}
+	for _, lib := range libs {
+		libName := strings.Split(strings.TrimPrefix(filepath.Base(lib), "lib"), ".")[0]
+		AvailableShims[libName] = lib
 	}
 
-	// Verify we have permissions - either running as root, or we have group access to the driver
-	fd, err := os.OpenFile("/dev/kfd", os.O_RDWR, 0666)
-	if err != nil {
-		if errors.Is(err, fs.ErrPermission) {
-			log.Fatalf("Radeon card detected, but permissions not set up properly. Either run ollama as root, or add you user account to the render group.")
-			return err
-		} else if errors.Is(err, fs.ErrNotExist) {
-			// expected behavior without a radeon card
-			return nil
+	// Only check ROCm access if we have the dynamic lib loaded
+	if _, rocmPresent := AvailableShims["rocm_server"]; rocmPresent {
+		// Verify we have permissions - either running as root, or we have group access to the driver
+		fd, err := os.OpenFile("/dev/kfd", os.O_RDWR, 0666)
+		if err != nil {
+			if errors.Is(err, fs.ErrPermission) {
+				log.Fatalf("Radeon card detected, but permissions not set up properly. Either run ollama as root, or add your user account to the render group.")
+				return err
+			} else if errors.Is(err, fs.ErrNotExist) {
+				// expected behavior without a radeon card
+				return nil
+			}
+
+			return fmt.Errorf("failed to check permission on /dev/kfd: %w", err)
 		}
+		fd.Close()
-		return fmt.Errorf("failed to check permission on /dev/kfd: %w", err)
-	}
-	fd.Close()
-
-	shimMutex.Lock()
-	defer shimMutex.Unlock()
-	if llm != nil {
-		return nil
-	}
-	var libName string
-	switch runtime.GOOS {
-	case "darwin":
-		// shouldn't happen
-		return nil
-	case "linux":
-		libName = "librocm_server.so"
-	case "windows":
-		libName = "rocm_server.dll"
-	default:
-		// shouldn't happen
-		return nil
-	}
-	libPath := C.CString(filepath.Join(workdir, libName))
-	defer C.free(unsafe.Pointer(libPath))
-	resp := newExtServerResp(128)
-	defer freeExtServerResp(resp)
-	var srv C.struct_rocm_llama_server
-	C.rocm_shim_init(libPath, &srv, &resp)
-	if resp.id < 0 {
-		// TODO - consider softening this failure mode to allow fall-back to the CUDA based built-in llm
-		// and run against CPU
-		return fmt.Errorf("Unable to load AMD GPU library: %s", C.GoString(resp.msg))
-	}
-	llm = &shimExtServer{
-		s:       srv,
-		options: api.DefaultOptions(),
-	}
 	return nil
 }
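
Note: the key derivation in nativeInit is worth spelling out. AvailableShims is keyed by the library file's base name with its lib prefix and extension stripped, which matches what gpu.GetGPUInfo now reports in the Library field. A small runnable check of that transformation (the paths are examples):

package main

import (
	"fmt"
	"path/filepath"
	"strings"
)

// shimName mirrors the expression used in nativeInit above.
func shimName(lib string) string {
	return strings.Split(strings.TrimPrefix(filepath.Base(lib), "lib"), ".")[0]
}

func main() {
	fmt.Println(shimName("/tmp/ollama/llama.cpp/gguf/build/lib/librocm_server.so")) // rocm_server
	fmt.Println(shimName("/tmp/ollama/llama.cpp/gguf/build/lib/libcuda_server.so")) // cuda_server
}
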