diff --git a/llama/example/main.go b/llama/example/main.go
index dde52440..f21431b2 100644
--- a/llama/example/main.go
+++ b/llama/example/main.go
@@ -6,6 +6,7 @@ import (
 	"io"
 	"log"
 	"os"
+	"runtime"
 	"strings"
 
 	"github.com/ollama/ollama/llama"
@@ -28,9 +29,11 @@ func main() {
 	// load the model
 	llama.BackendInit()
-	params := llama.NewModelParams()
+	params := llama.NewModelParams(999, 0, func(p float32) {
+		fmt.Printf("loading... %f\n", p)
+	})
 	model := llama.LoadModelFromFile(*mpath, params)
 
-	ctxParams := llama.NewContextParams()
+	ctxParams := llama.NewContextParams(2048, runtime.NumCPU(), false)
 
 	// language model context
 	lc := llama.NewContextWithModel(model, ctxParams)
@@ -65,7 +68,7 @@ func main() {
 			panic("prompt must contain exactly one <image>")
 		}
 
-		beforeTokens, err := lc.Model().Tokenize(parts[0], 2048, true, true)
+		beforeTokens, err := lc.Model().Tokenize(parts[0], true, true)
 		if err != nil {
 			panic(err)
 		}
@@ -82,7 +85,7 @@ func main() {
 
 		llama.LlavaEvalImageEmbed(lc, embedding, 512, &nPast)
 
-		afterTokens, err := lc.Model().Tokenize(parts[1], 2048, true, true)
+		afterTokens, err := lc.Model().Tokenize(parts[1], true, true)
 		if err != nil {
 			panic(err)
 		}
@@ -92,7 +95,7 @@ func main() {
 			nPast++
 		}
 	} else {
-		tokens, err := lc.Model().Tokenize(*prompt, 2048, true, true)
+		tokens, err := lc.Model().Tokenize(*prompt, true, true)
 		if err != nil {
 			panic(err)
 		}
diff --git a/llm/filetype.go b/llm/filetype.go
index a1c2f9d7..7a8e9f69 100644
--- a/llm/filetype.go
+++ b/llm/filetype.go
@@ -2,10 +2,10 @@ package llm
 
 import "fmt"
 
-type FileType uint32
+type fileType uint32
 
 const (
-	fileTypeF32 FileType = iota
+	fileTypeF32 fileType = iota
 	fileTypeF16
 	fileTypeQ4_0
 	fileTypeQ4_1
@@ -41,7 +41,7 @@ const (
 	fileTypeUnknown
 )
 
-func ParseFileType(s string) (FileType, error) {
+func ParseFileType(s string) (fileType, error) {
 	switch s {
 	case "F32":
 		return fileTypeF32, nil
@@ -108,7 +108,7 @@ func ParseFileType(s string) (FileType, error) {
 	}
 }
 
-func (t FileType) String() string {
+func (t fileType) String() string {
 	switch t {
 	case fileTypeF32:
 		return "F32"
@@ -175,6 +175,6 @@ func (t FileType) String() string {
 	}
 }
 
-func (t FileType) Value() uint32 {
+func (t fileType) Value() uint32 {
 	return uint32(t)
 }
diff --git a/llm/generate/gen_darwin.sh b/llm/generate/gen_darwin.sh
index 0c42f9f7..acea9c8d 100755
--- a/llm/generate/gen_darwin.sh
+++ b/llm/generate/gen_darwin.sh
@@ -100,4 +100,4 @@ esac
 cleanup
 wait_for_compress
 
-echo "go generate completed. LLM runners: $(cd ${BUILD_DIR}/..; echo *)"
\ No newline at end of file
+echo "go generate completed. LLM runners: $(cd ${BUILD_DIR}/..; echo *)"
diff --git a/llm/generate/gen_linux.sh b/llm/generate/gen_linux.sh
index c8cd3567..d8f56ec9 100755
--- a/llm/generate/gen_linux.sh
+++ b/llm/generate/gen_linux.sh
@@ -58,6 +58,19 @@ init_vars
 git_module_setup
 apply_patches
 
+init_vars
+if [ -z "${OLLAMA_SKIP_STATIC_GENERATE}" -o "${OLLAMA_CPU_TARGET}" = "static" ]; then
+    # Builds by default, allows skipping, forces build if OLLAMA_CPU_TARGET="static"
+    # Enables optimized Dockerfile builds using a blanket skip and targeted overrides
+    # Static build for linking into the Go binary
+    init_vars
+    CMAKE_TARGETS="--target llama --target ggml"
+    CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DLLAMA_NATIVE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+    BUILD_DIR="../build/linux/${ARCH}_static"
+    echo "Building static library"
+    build
+fi
+
 init_vars
 if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
     # Users building from source can tune the exact flags we pass to cmake for configuring
diff --git a/llm/generate/gen_windows.ps1 b/llm/generate/gen_windows.ps1
index d2914189..da1c9890 100644
--- a/llm/generate/gen_windows.ps1
+++ b/llm/generate/gen_windows.ps1
@@ -177,6 +177,39 @@ function cleanup {
 # -DGGML_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen
 # -DGGML_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver
 
+
+function build_static() {
+    if ((-not "${env:OLLAMA_SKIP_STATIC_GENERATE}") -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "static"))) {
+        # GCC build for direct linking into the Go binary
+        init_vars
+        # cmake will silently fallback to msvc compilers if mingw isn't in the path, so detect and fail fast
+        # as we need this to be compiled by gcc for golang to be able to link with it
+        write-host "Checking for MinGW..."
+        # error action ensures we exit on failure
+        get-command gcc
+        get-command mingw32-make
+        $oldTargets = $script:cmakeTargets
+        $script:cmakeTargets = @("llama", "ggml")
+        $script:cmakeDefs = @(
+            "-G", "MinGW Makefiles"
+            "-DCMAKE_C_COMPILER=gcc.exe",
+            "-DCMAKE_CXX_COMPILER=g++.exe",
+            "-DBUILD_SHARED_LIBS=off",
+            "-DLLAMA_NATIVE=off",
+            "-DLLAMA_AVX=off",
+            "-DLLAMA_AVX2=off",
+            "-DLLAMA_AVX512=off",
+            "-DLLAMA_F16C=off",
+            "-DLLAMA_FMA=off")
+        $script:buildDir="../build/windows/${script:ARCH}_static"
+        write-host "Building static library"
+        build
+        $script:cmakeTargets = $oldTargets
+    } else {
+        write-host "Skipping CPU generation step as requested"
+    }
+}
+
 function build_cpu($gen_arch) {
     if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu"))) {
         # remaining llama.cpp builds use MSVC
@@ -364,6 +397,7 @@ init_vars
 if ($($args.count) -eq 0) {
     git_module_setup
     apply_patches
+    build_static
     if ($script:ARCH -eq "arm64") {
         build_cpu("ARM64")
     } else { # amd64
diff --git a/llm/ggml.go b/llm/ggml.go
index bec26f93..ab436095 100644
--- a/llm/ggml.go
+++ b/llm/ggml.go
@@ -55,9 +55,9 @@ func (kv KV) ParameterCount() uint64 {
 	return kv.u64("general.parameter_count")
 }
 
-func (kv KV) FileType() FileType {
+func (kv KV) FileType() fileType {
 	if u64 := kv.u64("general.file_type"); u64 > 0 {
-		return FileType(uint32(u64))
+		return fileType(uint32(u64))
 	}
 
 	return fileTypeUnknown
diff --git a/llm/llm.go b/llm/llm.go
new file mode 100644
index 00000000..2a0c4b91
--- /dev/null
+++ b/llm/llm.go
@@ -0,0 +1,39 @@
+package llm
+
+// #cgo CFLAGS: -Illama.cpp
+// #cgo darwin,arm64 LDFLAGS: ${SRCDIR}/build/darwin/arm64_static/libllama.a -lstdc++
+// #cgo darwin,amd64 LDFLAGS: ${SRCDIR}/build/darwin/x86_64_static/libllama.a -lstdc++
+// #cgo windows,amd64 LDFLAGS: ${SRCDIR}/build/windows/amd64_static/libllama.a -static -lstdc++
+// #cgo windows,arm64 LDFLAGS: ${SRCDIR}/build/windows/arm64_static/libllama.a -static -lstdc++
+// #cgo linux,amd64 LDFLAGS: ${SRCDIR}/build/linux/x86_64_static/libllama.a -lstdc++
+// #cgo linux,arm64 LDFLAGS: ${SRCDIR}/build/linux/arm64_static/libllama.a -lstdc++
+// #include <stdlib.h>
+// #include "llama.h"
+import "C"
+import (
+	"fmt"
+	"unsafe"
+)
+
+// SystemInfo is an unused example of calling llama.cpp functions using CGo
+func SystemInfo() string {
+	return C.GoString(C.llama_print_system_info())
+}
+
+func Quantize(infile, outfile string, ftype fileType) error {
+	cinfile := C.CString(infile)
+	defer C.free(unsafe.Pointer(cinfile))
+
+	coutfile := C.CString(outfile)
+	defer C.free(unsafe.Pointer(coutfile))
+
+	params := C.llama_model_quantize_default_params()
+	params.nthread = -1
+	params.ftype = ftype.Value()
+
+	if rc := C.llama_model_quantize(cinfile, coutfile, &params); rc != 0 {
+		return fmt.Errorf("llama_model_quantize: %d", rc)
+	}
+
+	return nil
+}
diff --git a/server/images.go b/server/images.go
index ede5fc00..b5bf7ad6 100644
--- a/server/images.go
+++ b/server/images.go
@@ -26,7 +26,6 @@ import (
 	"github.com/ollama/ollama/auth"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
-	"github.com/ollama/ollama/llama"
 	"github.com/ollama/ollama/llm"
 	"github.com/ollama/ollama/parser"
 	"github.com/ollama/ollama/template"
@@ -454,7 +453,7 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio
 		defer temp.Close()
 		defer os.Remove(temp.Name())
 
-		if err := llama.Quantize(blob, temp.Name(), want); err != nil {
+		if err := llm.Quantize(blob, temp.Name(), want); err != nil {
 			return err
 		}
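
A minimal usage sketch (not part of the patch) of the quantization path this change wires up: `ParseFileType` and `Quantize` now both live in the `llm` package, so a caller resolves the target file type and hands it to the CGo-backed quantizer, which requires the static `libllama.a` produced by the generate scripts above to already be built. The `"Q4_0"` target and the GGUF paths are illustrative assumptions; only the call shapes mirror the `server/images.go` hunk.

```go
package main

import (
	"log"

	"github.com/ollama/ollama/llm"
)

func main() {
	// Resolve the target quantization; "Q4_0" is an assumed example value
	// mapping to the fileTypeQ4_0 constant in llm/filetype.go.
	want, err := llm.ParseFileType("Q4_0")
	if err != nil {
		log.Fatal(err)
	}

	// Paths are placeholders; CreateModel passes a blob path and a temp file,
	// as in the server/images.go hunk above.
	if err := llm.Quantize("model-f16.gguf", "model-q4_0.gguf", want); err != nil {
		log.Fatal(err)
	}
}
```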