llama: wire up builtin runner
This adds a new entrypoint into the ollama CLI to run the cgo-built runner. On Mac arm64, this will have GPU support, but on all other platforms it will be the lowest-common-denominator CPU build. After we fully transition to the new Go runners, more tech debt can be removed and we can stop building the "default" runner via make, relying on the builtin runner instead.
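The wiring has two halves, both shown in the diff below: the server re-invokes the ollama executable with a "_runner" sentinel as the first argument, and main.go intercepts that sentinel before cobra parses anything, handing control to the runner package. A minimal, self-contained sketch of the flow (spawnBuiltinRunner is a hypothetical name used only here; the real logic lives in NewLlamaServer and main.go below):

package main

import (
	"fmt"
	"os"
	"os/exec"
)

// Server side: reach the built-in runner by re-invoking our own
// executable with the "_runner" sentinel prepended to the arguments.
// (Hypothetical helper; the real code is in NewLlamaServer below.)
func spawnBuiltinRunner(params ...string) (*exec.Cmd, error) {
	exe, err := os.Executable()
	if err != nil {
		return nil, err
	}
	cmd := exec.Command(exe, append([]string{"_runner"}, params...)...)
	return cmd, cmd.Start()
}

// CLI side: strip the sentinel and run the runner loop instead of the
// normal cobra CLI (runner.RunnerMain() in the real code; stubbed here).
func main() {
	if len(os.Args) >= 2 && os.Args[1] == "_runner" {
		os.Args = append([]string{os.Args[0]}, os.Args[2:]...)
		fmt.Println("runner mode, args:", os.Args[1:])
		return
	}
	fmt.Println("normal CLI mode")
}

The sentinel approach means a single binary can act as both the CLI and the runner subprocess, without shipping a separate executable on disk.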
parent 078f666f73 · commit 923b329481
@@ -24,17 +24,17 @@ all: $(BUILD_RUNNERS) $(DIST_RUNNERS) $(PAYLOAD_RUNNERS)
 $(RUNNERS_BUILD_DIR)/$(DEFAULT_RUNNER)/ollama_llama_server$(EXE_EXT): TARGET_CPU_FLAGS=$(CUSTOM_CPU_FLAGS)
 $(RUNNERS_BUILD_DIR)/$(DEFAULT_RUNNER)/ollama_llama_server$(EXE_EXT): *.go ./runner/*.go $(COMMON_SRCS) $(COMMON_HDRS)
 	@-mkdir -p $(dir $@)
-	GOARCH=$(ARCH) go build -buildmode=pie $(CPU_GOFLAGS) -trimpath $(if $(CUSTOM_CPU_FLAGS),-tags $(subst $(space),$(comma),$(CUSTOM_CPU_FLAGS))) -o $@ ./runner
+	GOARCH=$(ARCH) go build -buildmode=pie $(CPU_GOFLAGS) -trimpath $(if $(CUSTOM_CPU_FLAGS),-tags $(subst $(space),$(comma),$(CUSTOM_CPU_FLAGS))) -o $@ ./runner/cmd

 $(RUNNERS_BUILD_DIR)/cpu_avx/ollama_llama_server$(EXE_EXT): TARGET_CPU_FLAGS="avx"
 $(RUNNERS_BUILD_DIR)/cpu_avx/ollama_llama_server$(EXE_EXT): *.go ./runner/*.go $(COMMON_SRCS) $(COMMON_HDRS)
 	@-mkdir -p $(dir $@)
-	GOARCH=$(ARCH) go build -buildmode=pie $(CPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(TARGET_CPU_FLAGS)) -o $@ ./runner
+	GOARCH=$(ARCH) go build -buildmode=pie $(CPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(TARGET_CPU_FLAGS)) -o $@ ./runner/cmd

 $(RUNNERS_BUILD_DIR)/cpu_avx2/ollama_llama_server$(EXE_EXT): TARGET_CPU_FLAGS="avx avx2"
 $(RUNNERS_BUILD_DIR)/cpu_avx2/ollama_llama_server$(EXE_EXT): *.go ./runner/*.go $(COMMON_SRCS) $(COMMON_HDRS)
 	@-mkdir -p $(dir $@)
-	GOARCH=$(ARCH) go build -buildmode=pie $(CPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(TARGET_CPU_FLAGS)) -o $@ ./runner
+	GOARCH=$(ARCH) go build -buildmode=pie $(CPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(TARGET_CPU_FLAGS)) -o $@ ./runner/cmd

 $(RUNNERS_DIST_DIR)/%: $(RUNNERS_BUILD_DIR)/%
 	@-mkdir -p $(dir $@)
@@ -82,7 +82,7 @@ $(BUILD_DIR)/%.$(GPU_RUNNER_NAME).$(OBJ_EXT): %.cpp
 $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/ollama_llama_server$(EXE_EXT): TARGET_CGO_LDFLAGS = -L"$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/" $(CGO_EXTRA_LDFLAGS)
 $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/ollama_llama_server$(EXE_EXT): $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT) *.go ./runner/*.go $(COMMON_SRCS) $(COMMON_HDRS)
 	@-mkdir -p $(dir $@)
-	GOARCH=$(ARCH) CGO_LDFLAGS="$(TARGET_CGO_LDFLAGS)" go build -buildmode=pie $(GPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(GPU_RUNNER_CPU_FLAGS) $(GPU_RUNNER_GO_TAGS)) -o $@ ./runner
+	GOARCH=$(ARCH) CGO_LDFLAGS="$(TARGET_CGO_LDFLAGS)" go build -buildmode=pie $(GPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(GPU_RUNNER_CPU_FLAGS) $(GPU_RUNNER_GO_TAGS)) -o $@ ./runner/cmd
 $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT): $(GPU_RUNNER_OBJS) $(DIST_GPU_RUNNER_LIB_DEPS) $(COMMON_HDRS) $(GPU_RUNNER_HDRS)
 	@-mkdir -p $(dir $@)
 	$(CCACHE) $(GPU_COMPILER) --shared $(GPU_RUNNER_DRIVER_LIB_LINK) -L${DIST_GPU_RUNNER_DEPS_DIR} $(foreach lib, $(GPU_RUNNER_LIBS_SHORT), -l$(lib)) $(GPU_RUNNER_OBJS) -o $@
@@ -1,4 +1,4 @@
-package main
+package runner

 import (
 	"errors"
@@ -1,4 +1,4 @@
-package main
+package runner

 import (
 	"reflect"
llama/runner/cmd/cmd.go (new file, +7)
@@ -0,0 +1,7 @@
+package main
+
+import "github.com/ollama/ollama/llama/runner"
+
+func main() {
+	runner.RunnerMain()
+}
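This thin wrapper is what the Makefile rules above now point at: the runner logic moves into the importable runner package, while runner/cmd keeps producing the standalone ollama_llama_server binaries for the non-builtin variants.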
@@ -1,4 +1,4 @@
-package main
+package runner

 import (
 	"encoding/json"
@@ -1,4 +1,4 @@
-package main
+package runner

 import (
 	"context"
@@ -827,7 +827,7 @@ func (s *Server) loadModel(
 	s.ready.Done()
 }

-func main() {
+func RunnerMain() {
 	mpath := flag.String("model", "", "Path to model binary file")
 	ppath := flag.String("mmproj", "", "Path to projector binary file")
 	parallel := flag.Int("parallel", 1, "Number of sequences to handle simultaneously")
@@ -917,6 +917,7 @@ func main() {
 	listener, err := net.Listen("tcp", addr)
 	if err != nil {
 		fmt.Println("Listen error:", err)
+		cancel()
 		return
 	}
 	defer listener.Close()
@@ -1,4 +1,4 @@
-package main
+package runner

 import (
 	"strings"
@@ -1,4 +1,4 @@
-package main
+package runner

 import (
 	"reflect"
@@ -158,7 +158,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 		return nil, finalErr
 	}
 	var servers []string
-	if cpuRunner != "" {
+	if cpuRunner != "" && rDir != "" {
 		servers = []string{cpuRunner}
 	} else {
 		servers = runners.ServersForGpu(gpus[0]) // All GPUs in the list are matching Library and Variant
@@ -270,6 +270,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 	}

 	for i := range servers {
+		builtin := servers[i] == "builtin"
 		dir := availableServers[servers[i]]
 		if dir == "" {
 			// Shouldn't happen
@@ -278,7 +279,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 			continue
 		}

-		if strings.HasPrefix(servers[i], "cpu") {
+		if strings.HasPrefix(servers[i], "cpu") || (builtin && !(runtime.GOOS == "darwin" && runtime.GOARCH == "arm64")) {
 			gpus = discover.GetCPUInfo()
 		}
@@ -295,7 +296,12 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 			slog.Debug("ResolveTCPAddr failed ", "error", err)
 			port = rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range
 		}
-		finalParams := append(params, "--port", strconv.Itoa(port))
+		finalParams := []string{}
+		if builtin {
+			finalParams = []string{"_runner"}
+		}
+		finalParams = append(finalParams, params...)
+		finalParams = append(finalParams, "--port", strconv.Itoa(port))

 		pathEnv := "LD_LIBRARY_PATH"
 		if runtime.GOOS == "windows" {
@@ -316,9 +322,19 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 			libraryPaths = append([]string{gpus[0].DependencyPath}, libraryPaths...)
 		}

-		server := filepath.Join(dir, "ollama_llama_server")
-		if runtime.GOOS == "windows" {
-			server += ".exe"
+		var server string
+		if builtin {
+			exe, err := os.Executable()
+			if err != nil {
+				slog.Warn("executable lookup failure", "error", err)
+				continue
+			}
+			server = exe
+		} else {
+			server = filepath.Join(dir, "ollama_llama_server")
+			if runtime.GOOS == "windows" {
+				server += ".exe"
+			}
 		}

 		// Detect tmp cleaners wiping out the file
main.go (+9)
@@ -2,12 +2,21 @@ package main

 import (
 	"context"
+	"os"

 	"github.com/spf13/cobra"

 	"github.com/ollama/ollama/cmd"
+	"github.com/ollama/ollama/llama/runner"
 )

 func main() {
+	if len(os.Args) >= 2 {
+		if os.Args[1] == "_runner" {
+			os.Args = append([]string{os.Args[0]}, os.Args[2:]...)
+			runner.RunnerMain()
+			return
+		}
+	}
 	cobra.CheckErr(cmd.NewCLI().ExecuteContext(context.Background()))
 }
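In effect, the built-in runner is launched as "ollama _runner --model ... --port ...". Because the sentinel is stripped before cobra ever parses arguments, it stays out of the CLI's command tree and help output.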
@@ -105,7 +105,9 @@ func locateRunners() (string, error) {
 			return candidate, nil
 		}
 	}
-	return "", fmt.Errorf("unable to locate runners in any search path %v", paths)
+	// Fall back to built-in
+	slog.Debug("unable to locate runners, using built-in")
+	return "", nil
 }

 // Return true if we're carying nested payloads for the runners
@@ -276,6 +278,11 @@ func cleanupTmpDirs() {
 // lowest common denominator
 func GetAvailableServers(payloadsDir string) map[string]string {
+	if payloadsDir == "" {
+		exe, err := os.Executable()
+		if err == nil {
+			slog.Debug("Wiring up built-in runner")
+			return map[string]string{"builtin": filepath.Dir(exe)}
+		}
+		slog.Error("empty runner dir")
+		return nil
+	}
@@ -304,6 +311,12 @@ func GetAvailableServers(payloadsDir string) map[string]string {
 func ServersForGpu(info discover.GpuInfo) []string {
 	// glob workDir for files that start with ollama_
 	availableServers := GetAvailableServers(runnersDir)
+
+	// Short circuit if the only option is built-in
+	if _, ok := availableServers["builtin"]; ok {
+		return []string{"builtin"}
+	}
+
 	requested := info.Library
 	if info.Variant != discover.CPUCapabilityNone.String() {
 		requested += "_" + info.Variant
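Taken together with the llm changes above, the fallback chain is: locateRunners now returns an empty directory instead of an error when no runner payloads are found; GetAvailableServers treats the empty directory as "use the executable's own directory" and advertises a single "builtin" server; ServersForGpu then short-circuits to it, and NewLlamaServer re-invokes the current binary with the _runner sentinel.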
|
Loading…
x
Reference in New Issue
Block a user