From 923b3294817f9106c40e3dce050f5375141b8da1 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen <daniel@ollama.com>
Date: Thu, 26 Sep 2024 15:21:33 -0700
Subject: [PATCH] llama: wire up builtin runner

This adds a new entrypoint into the ollama CLI to run the cgo-built runner.
On Mac arm64, this will have GPU support, but on all other platforms it will
be the lowest-common-denominator CPU build. After we fully transition
to the new Go runners, more tech debt can be removed, and we can stop building
the "default" runner via make and always rely on the builtin.
---
 llama/make/Makefile.default  |  6 +++---
 llama/make/gpu.make          |  2 +-
 llama/runner/cache.go        |  2 +-
 llama/runner/cache_test.go   |  2 +-
 llama/runner/cmd/cmd.go      |  7 +++++++
 llama/runner/requirements.go |  2 +-
 llama/runner/runner.go       |  5 +++--
 llama/runner/stop.go         |  2 +-
 llama/runner/stop_test.go    |  2 +-
 llm/server.go                | 28 ++++++++++++++++++++++------
 main.go                      |  9 +++++++++
 runners/common.go            | 15 ++++++++++++++-
 12 files changed, 64 insertions(+), 18 deletions(-)
 create mode 100644 llama/runner/cmd/cmd.go

diff --git a/llama/make/Makefile.default b/llama/make/Makefile.default
index 95b13a73..aa05d8d0 100644
--- a/llama/make/Makefile.default
+++ b/llama/make/Makefile.default
@@ -24,17 +24,17 @@ all: $(BUILD_RUNNERS) $(DIST_RUNNERS) $(PAYLOAD_RUNNERS)
 $(RUNNERS_BUILD_DIR)/$(DEFAULT_RUNNER)/ollama_llama_server$(EXE_EXT): TARGET_CPU_FLAGS=$(CUSTOM_CPU_FLAGS)
 $(RUNNERS_BUILD_DIR)/$(DEFAULT_RUNNER)/ollama_llama_server$(EXE_EXT): *.go ./runner/*.go $(COMMON_SRCS) $(COMMON_HDRS)
 	@-mkdir -p $(dir $@)
-	GOARCH=$(ARCH) go build -buildmode=pie $(CPU_GOFLAGS) -trimpath $(if $(CUSTOM_CPU_FLAGS),-tags $(subst $(space),$(comma),$(CUSTOM_CPU_FLAGS)))  -o $@ ./runner
+	GOARCH=$(ARCH) go build -buildmode=pie $(CPU_GOFLAGS) -trimpath $(if $(CUSTOM_CPU_FLAGS),-tags $(subst $(space),$(comma),$(CUSTOM_CPU_FLAGS)))  -o $@ ./runner/cmd
 
 $(RUNNERS_BUILD_DIR)/cpu_avx/ollama_llama_server$(EXE_EXT): TARGET_CPU_FLAGS="avx"
 $(RUNNERS_BUILD_DIR)/cpu_avx/ollama_llama_server$(EXE_EXT): *.go ./runner/*.go $(COMMON_SRCS) $(COMMON_HDRS)
 	@-mkdir -p $(dir $@)
-	GOARCH=$(ARCH) go build -buildmode=pie $(CPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(TARGET_CPU_FLAGS)) -o $@ ./runner
+	GOARCH=$(ARCH) go build -buildmode=pie $(CPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(TARGET_CPU_FLAGS)) -o $@ ./runner/cmd
 
 $(RUNNERS_BUILD_DIR)/cpu_avx2/ollama_llama_server$(EXE_EXT): TARGET_CPU_FLAGS="avx avx2"
 $(RUNNERS_BUILD_DIR)/cpu_avx2/ollama_llama_server$(EXE_EXT): *.go ./runner/*.go $(COMMON_SRCS) $(COMMON_HDRS)
 	@-mkdir -p $(dir $@)
-	GOARCH=$(ARCH) go build -buildmode=pie $(CPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(TARGET_CPU_FLAGS)) -o $@ ./runner
+	GOARCH=$(ARCH) go build -buildmode=pie $(CPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(TARGET_CPU_FLAGS)) -o $@ ./runner/cmd
 
 $(RUNNERS_DIST_DIR)/%: $(RUNNERS_BUILD_DIR)/%
 	@-mkdir -p $(dir $@)
diff --git a/llama/make/gpu.make b/llama/make/gpu.make
index 939fa41a..cb154265 100644
--- a/llama/make/gpu.make
+++ b/llama/make/gpu.make
@@ -82,7 +82,7 @@ $(BUILD_DIR)/%.$(GPU_RUNNER_NAME).$(OBJ_EXT): %.cpp
 $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/ollama_llama_server$(EXE_EXT): TARGET_CGO_LDFLAGS = -L"$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/" $(CGO_EXTRA_LDFLAGS)
 $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/ollama_llama_server$(EXE_EXT): $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT) *.go ./runner/*.go $(COMMON_SRCS) $(COMMON_HDRS)
 	@-mkdir -p $(dir $@)
-	GOARCH=$(ARCH) CGO_LDFLAGS="$(TARGET_CGO_LDFLAGS)" go build -buildmode=pie  $(GPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(GPU_RUNNER_CPU_FLAGS) $(GPU_RUNNER_GO_TAGS)) -o $@ ./runner
+	GOARCH=$(ARCH) CGO_LDFLAGS="$(TARGET_CGO_LDFLAGS)" go build -buildmode=pie  $(GPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(GPU_RUNNER_CPU_FLAGS) $(GPU_RUNNER_GO_TAGS)) -o $@ ./runner/cmd
 $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT): $(GPU_RUNNER_OBJS) $(DIST_GPU_RUNNER_LIB_DEPS) $(COMMON_HDRS) $(GPU_RUNNER_HDRS)
 	@-mkdir -p $(dir $@)
 	$(CCACHE) $(GPU_COMPILER) --shared $(GPU_RUNNER_DRIVER_LIB_LINK) -L${DIST_GPU_RUNNER_DEPS_DIR} $(foreach lib, $(GPU_RUNNER_LIBS_SHORT), -l$(lib)) $(GPU_RUNNER_OBJS) -o $@
diff --git a/llama/runner/cache.go b/llama/runner/cache.go
index ef8f6cfb..3baa305b 100644
--- a/llama/runner/cache.go
+++ b/llama/runner/cache.go
@@ -1,4 +1,4 @@
-package main
+package runner
 
 import (
 	"errors"
diff --git a/llama/runner/cache_test.go b/llama/runner/cache_test.go
index cc13b5f2..30e74d96 100644
--- a/llama/runner/cache_test.go
+++ b/llama/runner/cache_test.go
@@ -1,4 +1,4 @@
-package main
+package runner
 
 import (
 	"reflect"
diff --git a/llama/runner/cmd/cmd.go b/llama/runner/cmd/cmd.go
new file mode 100644
index 00000000..26b56af6
--- /dev/null
+++ b/llama/runner/cmd/cmd.go
@@ -0,0 +1,7 @@
+package main
+
+import "github.com/ollama/ollama/llama/runner"
+
+func main() {
+	runner.RunnerMain()
+}
diff --git a/llama/runner/requirements.go b/llama/runner/requirements.go
index 71b3b9aa..4511559d 100644
--- a/llama/runner/requirements.go
+++ b/llama/runner/requirements.go
@@ -1,4 +1,4 @@
-package main
+package runner
 
 import (
 	"encoding/json"
diff --git a/llama/runner/runner.go b/llama/runner/runner.go
index f472d076..29183faf 100644
--- a/llama/runner/runner.go
+++ b/llama/runner/runner.go
@@ -1,4 +1,4 @@
-package main
+package runner
 
 import (
 	"context"
@@ -827,7 +827,7 @@ func (s *Server) loadModel(
 	s.ready.Done()
 }
 
-func main() {
+func RunnerMain() {
 	mpath := flag.String("model", "", "Path to model binary file")
 	ppath := flag.String("mmproj", "", "Path to projector binary file")
 	parallel := flag.Int("parallel", 1, "Number of sequences to handle simultaneously")
@@ -917,6 +917,7 @@ func main() {
 	listener, err := net.Listen("tcp", addr)
 	if err != nil {
 		fmt.Println("Listen error:", err)
+		cancel()
 		return
 	}
 	defer listener.Close()
diff --git a/llama/runner/stop.go b/llama/runner/stop.go
index c05f5e3d..8dcb08d3 100644
--- a/llama/runner/stop.go
+++ b/llama/runner/stop.go
@@ -1,4 +1,4 @@
-package main
+package runner
 
 import (
 	"strings"
diff --git a/llama/runner/stop_test.go b/llama/runner/stop_test.go
index 51b35fde..31dc161f 100644
--- a/llama/runner/stop_test.go
+++ b/llama/runner/stop_test.go
@@ -1,4 +1,4 @@
-package main
+package runner
 
 import (
 	"reflect"
diff --git a/llm/server.go b/llm/server.go
index a4c99dd9..8b5b0b76 100644
--- a/llm/server.go
+++ b/llm/server.go
@@ -158,7 +158,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 		return nil, finalErr
 	}
 	var servers []string
-	if cpuRunner != "" {
+	if cpuRunner != "" && rDir != "" {
 		servers = []string{cpuRunner}
 	} else {
 		servers = runners.ServersForGpu(gpus[0]) // All GPUs in the list are matching Library and Variant
@@ -270,6 +270,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 	}
 
 	for i := range servers {
+		builtin := servers[i] == "builtin"
 		dir := availableServers[servers[i]]
 		if dir == "" {
 			// Shouldn't happen
@@ -278,7 +279,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 			continue
 		}
 
-		if strings.HasPrefix(servers[i], "cpu") {
+		if strings.HasPrefix(servers[i], "cpu") || (builtin && !(runtime.GOOS == "darwin" && runtime.GOARCH == "arm64")) {
 			gpus = discover.GetCPUInfo()
 		}
 
@@ -295,7 +296,12 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 			slog.Debug("ResolveTCPAddr failed ", "error", err)
 			port = rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range
 		}
-		finalParams := append(params, "--port", strconv.Itoa(port))
+		finalParams := []string{}
+		if builtin {
+			finalParams = []string{"_runner"}
+		}
+		finalParams = append(finalParams, params...)
+		finalParams = append(finalParams, "--port", strconv.Itoa(port))
 
 		pathEnv := "LD_LIBRARY_PATH"
 		if runtime.GOOS == "windows" {
@@ -316,9 +322,19 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 			libraryPaths = append([]string{gpus[0].DependencyPath}, libraryPaths...)
 		}
 
-		server := filepath.Join(dir, "ollama_llama_server")
-		if runtime.GOOS == "windows" {
-			server += ".exe"
+		var server string
+		if builtin {
+			exe, err := os.Executable()
+			if err != nil {
+				slog.Warn("executable lookup failure", "error", err)
+				continue
+			}
+			server = exe
+		} else {
+			server = filepath.Join(dir, "ollama_llama_server")
+			if runtime.GOOS == "windows" {
+				server += ".exe"
+			}
 		}
 
 		// Detect tmp cleaners wiping out the file
diff --git a/main.go b/main.go
index 650e03a6..3f07eb40 100644
--- a/main.go
+++ b/main.go
@@ -2,12 +2,21 @@ package main
 
 import (
 	"context"
+	"os"
 
 	"github.com/spf13/cobra"
 
 	"github.com/ollama/ollama/cmd"
+	"github.com/ollama/ollama/llama/runner"
 )
 
 func main() {
+	if len(os.Args) >= 2 {
+		if os.Args[1] == "_runner" {
+			os.Args = append([]string{os.Args[0]}, os.Args[2:]...)
+			runner.RunnerMain()
+			return
+		}
+	}
 	cobra.CheckErr(cmd.NewCLI().ExecuteContext(context.Background()))
 }
diff --git a/runners/common.go b/runners/common.go
index 19014d75..c26a208c 100644
--- a/runners/common.go
+++ b/runners/common.go
@@ -105,7 +105,9 @@ func locateRunners() (string, error) {
 			return candidate, nil
 		}
 	}
-	return "", fmt.Errorf("unable to locate runners in any search path %v", paths)
+	// Fall back to built-in
+	slog.Debug("unable to locate runners, using built-in")
+	return "", nil
 }
 
 // Return true if we're carying nested payloads for the runners
@@ -276,6 +278,11 @@ func cleanupTmpDirs() {
 // lowest common denominator
 func GetAvailableServers(payloadsDir string) map[string]string {
 	if payloadsDir == "" {
+		exe, err := os.Executable()
+		if err == nil {
+			slog.Debug("Wiring up built-in runner")
+			return map[string]string{"builtin": filepath.Dir(exe)}
+		}
 		slog.Error("empty runner dir")
 		return nil
 	}
@@ -304,6 +311,12 @@ func GetAvailableServers(payloadsDir string) map[string]string {
 func ServersForGpu(info discover.GpuInfo) []string {
 	// glob workDir for files that start with ollama_
 	availableServers := GetAvailableServers(runnersDir)
+
+	// Short circuit if the only option is built-in
+	if _, ok := availableServers["builtin"]; ok {
+		return []string{"builtin"}
+	}
+
 	requested := info.Library
 	if info.Variant != discover.CPUCapabilityNone.String() {
 		requested += "_" + info.Variant