diff --git a/llama/make/Makefile.default b/llama/make/Makefile.default
index 95b13a73..aa05d8d0 100644
--- a/llama/make/Makefile.default
+++ b/llama/make/Makefile.default
@@ -24,17 +24,17 @@ all: $(BUILD_RUNNERS) $(DIST_RUNNERS) $(PAYLOAD_RUNNERS)
 $(RUNNERS_BUILD_DIR)/$(DEFAULT_RUNNER)/ollama_llama_server$(EXE_EXT): TARGET_CPU_FLAGS=$(CUSTOM_CPU_FLAGS)
-$(RUNNERS_BUILD_DIR)/$(DEFAULT_RUNNER)/ollama_llama_server$(EXE_EXT): *.go ./runner/*.go $(COMMON_SRCS) $(COMMON_HDRS)
+$(RUNNERS_BUILD_DIR)/$(DEFAULT_RUNNER)/ollama_llama_server$(EXE_EXT): *.go ./runner/*.go ./runner/cmd/*.go $(COMMON_SRCS) $(COMMON_HDRS)
 	@-mkdir -p $(dir $@)
-	GOARCH=$(ARCH) go build -buildmode=pie $(CPU_GOFLAGS) -trimpath $(if $(CUSTOM_CPU_FLAGS),-tags $(subst $(space),$(comma),$(CUSTOM_CPU_FLAGS)))  -o $@ ./runner
+	GOARCH=$(ARCH) go build -buildmode=pie $(CPU_GOFLAGS) -trimpath $(if $(CUSTOM_CPU_FLAGS),-tags $(subst $(space),$(comma),$(CUSTOM_CPU_FLAGS)))  -o $@ ./runner/cmd
 
 $(RUNNERS_BUILD_DIR)/cpu_avx/ollama_llama_server$(EXE_EXT): TARGET_CPU_FLAGS="avx"
-$(RUNNERS_BUILD_DIR)/cpu_avx/ollama_llama_server$(EXE_EXT): *.go ./runner/*.go $(COMMON_SRCS) $(COMMON_HDRS)
+$(RUNNERS_BUILD_DIR)/cpu_avx/ollama_llama_server$(EXE_EXT): *.go ./runner/*.go ./runner/cmd/*.go $(COMMON_SRCS) $(COMMON_HDRS)
 	@-mkdir -p $(dir $@)
-	GOARCH=$(ARCH) go build -buildmode=pie $(CPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(TARGET_CPU_FLAGS)) -o $@ ./runner
+	GOARCH=$(ARCH) go build -buildmode=pie $(CPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(TARGET_CPU_FLAGS)) -o $@ ./runner/cmd
 
 $(RUNNERS_BUILD_DIR)/cpu_avx2/ollama_llama_server$(EXE_EXT): TARGET_CPU_FLAGS="avx avx2"
-$(RUNNERS_BUILD_DIR)/cpu_avx2/ollama_llama_server$(EXE_EXT): *.go ./runner/*.go $(COMMON_SRCS) $(COMMON_HDRS)
+$(RUNNERS_BUILD_DIR)/cpu_avx2/ollama_llama_server$(EXE_EXT): *.go ./runner/*.go ./runner/cmd/*.go $(COMMON_SRCS) $(COMMON_HDRS)
 	@-mkdir -p $(dir $@)
-	GOARCH=$(ARCH) go build -buildmode=pie $(CPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(TARGET_CPU_FLAGS)) -o $@ ./runner
+	GOARCH=$(ARCH) go build -buildmode=pie $(CPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(TARGET_CPU_FLAGS)) -o $@ ./runner/cmd
 
 $(RUNNERS_DIST_DIR)/%: $(RUNNERS_BUILD_DIR)/%
 	@-mkdir -p $(dir $@)
diff --git a/llama/make/gpu.make b/llama/make/gpu.make
index 939fa41a..cb154265 100644
--- a/llama/make/gpu.make
+++ b/llama/make/gpu.make
@@ -82,7 +82,7 @@ $(BUILD_DIR)/%.$(GPU_RUNNER_NAME).$(OBJ_EXT): %.cpp
 $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/ollama_llama_server$(EXE_EXT): TARGET_CGO_LDFLAGS = -L"$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/" $(CGO_EXTRA_LDFLAGS)
-$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/ollama_llama_server$(EXE_EXT): $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT) *.go ./runner/*.go $(COMMON_SRCS) $(COMMON_HDRS)
+$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/ollama_llama_server$(EXE_EXT): $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT) *.go ./runner/*.go ./runner/cmd/*.go $(COMMON_SRCS) $(COMMON_HDRS)
 	@-mkdir -p $(dir $@)
-	GOARCH=$(ARCH) CGO_LDFLAGS="$(TARGET_CGO_LDFLAGS)" go build -buildmode=pie  $(GPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(GPU_RUNNER_CPU_FLAGS) $(GPU_RUNNER_GO_TAGS)) -o $@ ./runner
+	GOARCH=$(ARCH) CGO_LDFLAGS="$(TARGET_CGO_LDFLAGS)" go build -buildmode=pie  $(GPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(GPU_RUNNER_CPU_FLAGS) $(GPU_RUNNER_GO_TAGS)) -o $@ ./runner/cmd
 $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT): $(GPU_RUNNER_OBJS) $(DIST_GPU_RUNNER_LIB_DEPS) $(COMMON_HDRS) $(GPU_RUNNER_HDRS)
 	@-mkdir -p $(dir $@)
 	$(CCACHE) $(GPU_COMPILER) --shared $(GPU_RUNNER_DRIVER_LIB_LINK) -L${DIST_GPU_RUNNER_DEPS_DIR} $(foreach lib, $(GPU_RUNNER_LIBS_SHORT), -l$(lib)) $(GPU_RUNNER_OBJS) -o $@
diff --git a/llama/runner/cache.go b/llama/runner/cache.go
index ef8f6cfb..3baa305b 100644
--- a/llama/runner/cache.go
+++ b/llama/runner/cache.go
@@ -1,4 +1,4 @@
-package main
+package runner
 
 import (
 	"errors"
diff --git a/llama/runner/cache_test.go b/llama/runner/cache_test.go
index cc13b5f2..30e74d96 100644
--- a/llama/runner/cache_test.go
+++ b/llama/runner/cache_test.go
@@ -1,4 +1,4 @@
-package main
+package runner
 
 import (
 	"reflect"
diff --git a/llama/runner/cmd/cmd.go b/llama/runner/cmd/cmd.go
new file mode 100644
index 00000000..26b56af6
--- /dev/null
+++ b/llama/runner/cmd/cmd.go
@@ -0,0 +1,9 @@
+// Command cmd is the entry point for the standalone ollama_llama_server
+// binary; it only hands control to runner.RunnerMain.
+package main
+
+import "github.com/ollama/ollama/llama/runner"
+
+func main() {
+	runner.RunnerMain()
+}
diff --git a/llama/runner/requirements.go b/llama/runner/requirements.go
index 71b3b9aa..4511559d 100644
--- a/llama/runner/requirements.go
+++ b/llama/runner/requirements.go
@@ -1,4 +1,4 @@
-package main
+package runner
 
 import (
 	"encoding/json"
diff --git a/llama/runner/runner.go b/llama/runner/runner.go
index f472d076..29183faf 100644
--- a/llama/runner/runner.go
+++ b/llama/runner/runner.go
@@ -1,4 +1,4 @@
-package main
+package runner
 
 import (
 	"context"
@@ -827,7 +827,7 @@ func (s *Server) loadModel(
 	s.ready.Done()
 }
 
-func main() {
+func RunnerMain() {
 	mpath := flag.String("model", "", "Path to model binary file")
 	ppath := flag.String("mmproj", "", "Path to projector binary file")
 	parallel := flag.Int("parallel", 1, "Number of sequences to handle simultaneously")
@@ -917,6 +917,7 @@ func main() {
 	listener, err := net.Listen("tcp", addr)
 	if err != nil {
 		fmt.Println("Listen error:", err)
+		cancel()
 		return
 	}
 	defer listener.Close()
diff --git a/llama/runner/stop.go b/llama/runner/stop.go
index c05f5e3d..8dcb08d3 100644
--- a/llama/runner/stop.go
+++ b/llama/runner/stop.go
@@ -1,4 +1,4 @@
-package main
+package runner
 
 import (
 	"strings"
diff --git a/llama/runner/stop_test.go b/llama/runner/stop_test.go
index 51b35fde..31dc161f 100644
--- a/llama/runner/stop_test.go
+++ b/llama/runner/stop_test.go
@@ -1,4 +1,4 @@
-package main
+package runner
 
 import (
 	"reflect"
diff --git a/llm/server.go b/llm/server.go
index a4c99dd9..8b5b0b76 100644
--- a/llm/server.go
+++ b/llm/server.go
@@ -158,7 +158,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 		return nil, finalErr
 	}
 	var servers []string
-	if cpuRunner != "" {
+	if cpuRunner != "" && rDir != "" {
 		servers = []string{cpuRunner}
 	} else {
 		servers = runners.ServersForGpu(gpus[0]) // All GPUs in the list are matching Library and Variant
@@ -270,6 +270,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 	}
 
 	for i := range servers {
+		builtin := servers[i] == "builtin"
 		dir := availableServers[servers[i]]
 		if dir == "" {
 			// Shouldn't happen
@@ -278,7 +279,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 			continue
 		}
 
-		if strings.HasPrefix(servers[i], "cpu") {
+		if strings.HasPrefix(servers[i], "cpu") || (builtin && !(runtime.GOOS == "darwin" && runtime.GOARCH == "arm64")) {
 			gpus = discover.GetCPUInfo()
 		}
 
@@ -295,7 +296,12 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 			slog.Debug("ResolveTCPAddr failed ", "error", err)
 			port = rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range
 		}
-		finalParams := append(params, "--port", strconv.Itoa(port))
+		finalParams := []string{}
+		if builtin {
+			finalParams = []string{"_runner"}
+		}
+		finalParams = append(finalParams, params...)
+		finalParams = append(finalParams, "--port", strconv.Itoa(port))
 
 		pathEnv := "LD_LIBRARY_PATH"
 		if runtime.GOOS == "windows" {
@@ -316,9 +322,19 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 			libraryPaths = append([]string{gpus[0].DependencyPath}, libraryPaths...)
 		}
 
-		server := filepath.Join(dir, "ollama_llama_server")
-		if runtime.GOOS == "windows" {
-			server += ".exe"
+		var server string
+		if builtin {
+			exe, err := os.Executable()
+			if err != nil {
+				slog.Warn("executable lookup failure", "error", err)
+				continue
+			}
+			server = exe
+		} else {
+			server = filepath.Join(dir, "ollama_llama_server")
+			if runtime.GOOS == "windows" {
+				server += ".exe"
+			}
 		}
 
 		// Detect tmp cleaners wiping out the file
diff --git a/main.go b/main.go
index 650e03a6..3f07eb40 100644
--- a/main.go
+++ b/main.go
@@ -2,12 +2,21 @@ package main
 
 import (
 	"context"
+	"os"
 
 	"github.com/spf13/cobra"
 
 	"github.com/ollama/ollama/cmd"
+	"github.com/ollama/ollama/llama/runner"
 )
 
 func main() {
+	// A leading "_runner" argument means this process was launched as the
+	// built-in runner: drop that argument and hand off to RunnerMain.
+	if len(os.Args) >= 2 && os.Args[1] == "_runner" {
+		os.Args = append([]string{os.Args[0]}, os.Args[2:]...)
+		runner.RunnerMain()
+		return
+	}
 	cobra.CheckErr(cmd.NewCLI().ExecuteContext(context.Background()))
 }
diff --git a/runners/common.go b/runners/common.go
index 19014d75..c26a208c 100644
--- a/runners/common.go
+++ b/runners/common.go
@@ -105,7 +105,9 @@ func locateRunners() (string, error) {
 			return candidate, nil
 		}
 	}
-	return "", fmt.Errorf("unable to locate runners in any search path %v", paths)
+	// Fall back to built-in
+	slog.Debug("unable to locate runners, using built-in")
+	return "", nil
 }
 
 // Return true if we're carying nested payloads for the runners
@@ -276,6 +278,11 @@ func cleanupTmpDirs() {
 // lowest common denominator
 func GetAvailableServers(payloadsDir string) map[string]string {
 	if payloadsDir == "" {
+		exe, err := os.Executable()
+		if err == nil {
+			slog.Debug("Wiring up built-in runner")
+			return map[string]string{"builtin": filepath.Dir(exe)}
+		}
 		slog.Error("empty runner dir")
 		return nil
 	}
@@ -304,6 +311,12 @@ func GetAvailableServers(payloadsDir string) map[string]string {
 func ServersForGpu(info discover.GpuInfo) []string {
 	// glob workDir for files that start with ollama_
 	availableServers := GetAvailableServers(runnersDir)
+
+	// Short circuit if the only option is built-in
+	if _, ok := availableServers["builtin"]; ok {
+		return []string{"builtin"}
+	}
+
 	requested := info.Library
 	if info.Variant != discover.CPUCapabilityNone.String() {
 		requested += "_" + info.Variant