replace static build in llm

jmorganca 2024-05-18 22:22:46 -07:00
parent ec09be97e8
commit 01ccbc07fe
67 changed files with 14420 additions and 7669 deletions

.gitignore vendored (1 change)

@@ -5,7 +5,6 @@
.swp
dist
ollama
ggml-metal.metal
.cache
*.exe
.idea


@@ -56,6 +56,7 @@ bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
}
// backend buffer
GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
ggml_backend_buffer_type_t buft,
struct ggml_backend_buffer_i iface,
@@ -78,10 +79,6 @@ const char * ggml_backend_buffer_name(ggml_backend_buffer_t buffer) {
return buffer->iface.get_name(buffer);
}
#define ggml_assert_aligned(ptr) \
GGML_ASSERT(((uintptr_t) (ptr))%GGML_MEM_ALIGN == 0)
void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
if (buffer == NULL) {
return;

llama/ggml-cuda.cu vendored (5 changes)

@@ -715,9 +715,6 @@ static bool ggml_backend_buffer_is_cuda_split(ggml_backend_buffer_t buffer) {
GGML_CALL static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
delete ctx;
// HACK: this needs to be freed in msvc
free(buffer);
}
GGML_CALL static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
@@ -3031,7 +3028,7 @@ GGML_CALL static ggml_backend_t ggml_backend_reg_cuda_init(const char * params,
GGML_UNUSED(params);
}
// extern "C" GGML_CALL int ggml_backend_cuda_reg_devices();
extern "C" GGML_CALL int ggml_backend_cuda_reg_devices();
GGML_CALL int ggml_backend_cuda_reg_devices() {
int device_count = ggml_backend_cuda_get_device_count();

llama/ggml-cuda.h vendored (2 changes)

@@ -31,8 +31,6 @@ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_typ
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
GGML_API GGML_CALL int ggml_backend_cuda_reg_devices();
GGML_API GGML_CALL int ggml_backend_cuda_get_device_count(void);
GGML_API GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);

llama/ggml-metal.metal (new file, 6859 lines)

File diff suppressed because it is too large.

llama/ggml-metal.o (new binary file)

Binary file not shown.


@@ -3,14 +3,13 @@ package llama
// #cgo darwin,arm64 CFLAGS: -std=c11 -DGGML_USE_METAL -DGGML_METAL_EMBED_LIBRARY -DGGML_USE_ACCELERATE -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64
// #cgo darwin,arm64 CXXFLAGS: -std=c++11 -DGGML_USE_METAL -DGGML_METAL_EMBED_LIBRARY -DGGML_USE_ACCELERATE -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64
// #cgo darwin,amd64 CXXFLAGS: -std=c++11
// #cgo darwin,arm64 LDFLAGS: ggml-metal.o -framework Foundation -framework Metal -framework MetalKit -framework Accelerate
// #cgo darwin,amd64 LDFLAGS: -framework Foundation -framework Accelerate
// #cgo darwin,arm64 LDFLAGS: -ld_classic ${SRCDIR}/ggml-metal.o -framework Foundation -framework Metal -framework MetalKit -framework Accelerate
// #cgo darwin,amd64 LDFLAGS: -ld_classic -framework Foundation -framework Accelerate
// #cgo windows LDFLAGS: -lmsvcrt
// #cgo avx CFLAGS: -mavx
// #cgo avx CXXFLAGS: -mavx
// #cgo avx2 CFLAGS: -mavx -mavx2 -mfma
// #cgo avx2 CXXFLAGS: -mavx -mavx2 -mfma
// #cgo avx2 LDFLAGS: -lm
// #cgo cuda CFLAGS: -DGGML_USE_CUDA -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_MULTIPLATFORM -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
// #cgo cuda CXXFLAGS: -std=c++11 -DGGML_USE_CUDA -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_MULTIPLATFORM -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
// #cgo rocm CXXFLAGS: -std=c++11 -DGGML_USE_CUDA -DGGML_USE_HIPBLAS -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_MULTIPLATFORM -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
@@ -24,6 +23,8 @@ import (
"runtime"
"strings"
"unsafe"
"github.com/ollama/ollama/llm"
)
type Token int32
@@ -201,3 +202,21 @@ func (m *Model) Tokenize(text string, maxTokens int, addSpecial bool, parseSpeci
return tokens, nil
}
func Quantize(infile, outfile string, ftype llm.FileType) error {
cinfile := C.CString(infile)
defer C.free(unsafe.Pointer(cinfile))
coutfile := C.CString(outfile)
defer C.free(unsafe.Pointer(coutfile))
params := C.llama_model_quantize_default_params()
params.nthread = -1
params.ftype = ftype.Value()
if rc := C.llama_model_quantize(cinfile, coutfile, &params); rc != 0 {
return fmt.Errorf("llama_model_quantize: %d", rc)
}
return nil
}
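For context, a minimal sketch of how the new `llama.Quantize` entry point might be invoked alongside `llm.ParseFileType` (whose receiver type is exported later in this commit). The paths are placeholders, and "Q4_0" is an assumption based on the `fileTypeQ4_0` constant in `llm/filetype.go` below, not a case shown in this diff:

```go
package main

import (
	"log"

	"github.com/ollama/ollama/llama"
	"github.com/ollama/ollama/llm"
)

func main() {
	// "Q4_0" is an assumption: the ParseFileType switch below only shows
	// its "F32" case, but the constant list includes fileTypeQ4_0.
	ftype, err := llm.ParseFileType("Q4_0")
	if err != nil {
		log.Fatal(err)
	}

	// Placeholder paths; Quantize forwards to llama_model_quantize
	// exactly as shown in the hunk above.
	if err := llama.Quantize("model-f16.gguf", "model-q4_0.gguf", ftype); err != nil {
		log.Fatal(err)
	}
}
```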


@@ -1,11 +0,0 @@
sed -e '/#include "ggml-common.h"/r ggml-common.h' -e '/#include "ggml-common.h"/d' < ggml-metal.metal > ggml-metal-embed.metal
TEMP_ASSEMBLY=$(mktemp)
echo ".section __DATA, __ggml_metallib" > $TEMP_ASSEMBLY
echo ".globl _ggml_metallib_start" >> $TEMP_ASSEMBLY
echo "_ggml_metallib_start:" >> $TEMP_ASSEMBLY
echo ".incbin \"ggml-metal-embed.metal\"" >> $TEMP_ASSEMBLY
echo ".globl _ggml_metallib_end" >> $TEMP_ASSEMBLY
echo "_ggml_metallib_end:" >> $TEMP_ASSEMBLY
as -mmacosx-version-min=11.3 $TEMP_ASSEMBLY -o ggml-metal.o
rm -f $TEMP_ASSEMBLY
rm -rf ggml-metal-embed.metal


@@ -5,5 +5,5 @@
```
```
curl POST -H "Content-Type: application/json" -d '{"prompt": "hi"}' http://localhost:8080/
curl -X POST -H "Content-Type: application/json" -d '{"prompt": "hi"}' http://localhost:8080/
```
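The corrected endpoint can also be exercised from Go; a minimal client sketch, assuming only the address and request shape shown in this README. The per-token response objects are printed raw, since their JSON tags do not appear in this diff:

```go
package main

import (
	"bufio"
	"bytes"
	"fmt"
	"net/http"
)

func main() {
	// Request shape and address taken from the README above.
	body := bytes.NewBufferString(`{"prompt": "hi"}`)
	resp, err := http.Post("http://localhost:8080/", "application/json", body)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	// The server streams one JSON object per token over chunked encoding;
	// json.Encoder terminates each object with a newline, so a line
	// scanner is enough to split them.
	scanner := bufio.NewScanner(resp.Body)
	for scanner.Scan() {
		fmt.Println(scanner.Text())
	}
}
```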


@@ -23,29 +23,9 @@ type Response struct {
type Server struct {
model *llama.Model
lc *llama.Context
batch *llama.Batch
queue chan Sequence
seqs []*Sequence
// mu guards seqs
mu sync.Mutex
}
type Sequence struct {
prompt []llama.Token
out chan string
}
func schedule(parallel int, queue <-chan Sequence) {
// Fill sequences from the queue
// once a sequence finishes, remove it and add a new one from the queue
}
func process() {
// loop through the sequences, fill a batch, decode and sample tokens, responding to appropriate requests
}
var mu sync.Mutex
func (s *Server) stream(w http.ResponseWriter, r *http.Request) {
var request Request
@@ -59,23 +39,15 @@ func (s *Server) stream(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Transfer-Encoding", "chunked")
w.WriteHeader(http.StatusOK)
enc := json.NewEncoder(w)
// main loop
tokens, err := s.model.Tokenize(request.Prompt, 2048, true, true)
if err != nil {
panic(err)
}
seq := Sequence{prompt: tokens}
s.queue <- seq
// listen for the sequence to finish
for {
str := <-seq.out
if err := json.NewEncoder(w).Encode(&Response{Token: str}); err != nil {
log.Println("Failed to encode result:", err)
return
}
w.(http.Flusher).Flush()
}
batch := llama.NewBatch(512, 0, 1)
// prompt eval
for i, t := range tokens {
@@ -115,7 +87,6 @@ func (s *Server) stream(w http.ResponseWriter, r *http.Request) {
func main() {
mp := flag.String("model", "", "Path to model binary file")
parallel := flag.Int("parallel", 1, "Number of parallel requests to handle")
flag.Parse()
// load the model
@@ -131,8 +102,6 @@ func main() {
server := &Server{
model: model,
lc: lc,
queue: make(chan Sequence, 256),
seqs: make([]*Sequence, *parallel),
}
addr := "127.0.0.1:8080"


@@ -23,7 +23,7 @@ cp $src_dir/ggml-quants.c $dst_dir/ggml-quants.c
cp $src_dir/ggml-quants.h $dst_dir/ggml-quants.h
cp $src_dir/ggml-metal.metal $dst_dir/ggml-metal.metal
cp $src_dir/ggml-metal.h $dst_dir/ggml-metal.h
cp $src_dir/ggml-metal.m $dst_dir/ggml-metal-darwin_arm64.m
cp $src_dir/ggml-metal.m $dst_dir/ggml-metal.m
cp $src_dir/ggml-impl.h $dst_dir/ggml-impl.h
cp $src_dir/ggml-cuda.h $dst_dir/ggml-cuda.h
cp $src_dir/ggml-cuda.cu $dst_dir/ggml-cuda.cu
@@ -34,11 +34,23 @@ cp $src_dir/ggml-backend-impl.h $dst_dir/ggml-backend-impl.h
cp $src_dir/ggml-alloc.h $dst_dir/ggml-alloc.h
cp $src_dir/ggml-alloc.c $dst_dir/ggml-alloc.c
sed -i 's/extern "C" GGML_CALL int ggml_backend_cuda_reg_devices();/\/\/ extern "C" GGML_CALL int ggml_backend_cuda_reg_devices();/' ggml-cuda.cu
sed -i '34iGGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);' ggml-cuda.h
# ggml-cuda
mkdir -p $dst_dir/ggml-cuda
cp $src_dir/ggml-cuda/*.cu $dst_dir/ggml-cuda/
cp $src_dir/ggml-cuda/*.cuh $dst_dir/ggml-cuda/
sed -i 's/extern "C" GGML_CALL int ggml_backend_cuda_reg_devices();/\/\/ extern "C" GGML_CALL int ggml_backend_cuda_reg_devices();/' ggml-cuda.cu
sed -i '34iGGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);' ggml-cuda.h
# ggml-metal
sed -e '/#include "ggml-common.h"/r ggml-common.h' -e '/#include "ggml-common.h"/d' < ggml-metal.metal > temp.metal
TEMP_ASSEMBLY=$(mktemp)
echo ".section __DATA, __ggml_metallib" > $TEMP_ASSEMBLY
echo ".globl _ggml_metallib_start" >> $TEMP_ASSEMBLY
echo "_ggml_metallib_start:" >> $TEMP_ASSEMBLY
echo ".incbin \"temp.metal\"" >> $TEMP_ASSEMBLY
echo ".globl _ggml_metallib_end" >> $TEMP_ASSEMBLY
echo "_ggml_metallib_end:" >> $TEMP_ASSEMBLY
as -mmacosx-version-min=11.3 $TEMP_ASSEMBLY -o ggml-metal.o
rm -f $TEMP_ASSEMBLY
rm -rf temp.metal


@@ -2,10 +2,10 @@ package llm
import "fmt"
type fileType uint32
type FileType uint32
const (
fileTypeF32 fileType = iota
fileTypeF32 FileType = iota
fileTypeF16
fileTypeQ4_0
fileTypeQ4_1
@@ -41,7 +41,7 @@ const (
fileTypeUnknown
)
func ParseFileType(s string) (fileType, error) {
func ParseFileType(s string) (FileType, error) {
switch s {
case "F32":
return fileTypeF32, nil
@@ -108,7 +108,7 @@ func ParseFileType(s string) (fileType, error) {
}
}
func (t fileType) String() string {
func (t FileType) String() string {
switch t {
case fileTypeF32:
return "F32"
@@ -175,6 +175,6 @@ func (t fileType) String() string {
}
}
func (t fileType) Value() uint32 {
func (t FileType) Value() uint32 {
return uint32(t)
}
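With `FileType` exported, callers outside `llm` can now round-trip between the string, enum, and numeric forms; a small sketch using only the accessors visible in this diff:

```go
package main

import (
	"fmt"
	"log"

	"github.com/ollama/ollama/llm"
)

func main() {
	// "F32" is the one switch case shown above; it maps to fileTypeF32,
	// the first constant in the iota block, so Value() reports 0.
	ft, err := llm.ParseFileType("F32")
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("%s = %d\n", ft.String(), ft.Value())
}
```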


@@ -58,19 +58,6 @@ init_vars
git_module_setup
apply_patches
init_vars
if [ -z "${OLLAMA_SKIP_STATIC_GENERATE}" -o "${OLLAMA_CPU_TARGET}" = "static" ]; then
# Builds by default, allows skipping, forces build if OLLAMA_CPU_TARGET="static"
# Enables optimized Dockerfile builds using a blanket skip and targeted overrides
# Static build for linking into the Go binary
init_vars
CMAKE_TARGETS="--target llama --target ggml"
CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DGGML_NATIVE=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_OPENMP=off ${CMAKE_DEFS}"
BUILD_DIR="../build/linux/${ARCH}_static"
echo "Building static library"
build
fi
init_vars
if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
# Users building from source can tune the exact flags we pass to cmake for configuring


@@ -177,40 +177,6 @@ function cleanup {
# -DGGML_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen
# -DGGML_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver
function build_static() {
if ((-not "${env:OLLAMA_SKIP_STATIC_GENERATE}") -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "static"))) {
# GCC build for direct linking into the Go binary
init_vars
# cmake will silently fall back to msvc compilers if mingw isn't in the path, so detect and fail fast
# as we need this to be compiled by gcc for golang to be able to link with it
write-host "Checking for MinGW..."
# error action ensures we exit on failure
get-command gcc
get-command mingw32-make
$oldTargets = $script:cmakeTargets
$script:cmakeTargets = @("llama", "ggml")
$script:cmakeDefs = @(
"-G", "MinGW Makefiles"
"-DCMAKE_C_COMPILER=gcc.exe",
"-DCMAKE_CXX_COMPILER=g++.exe",
"-DBUILD_SHARED_LIBS=off",
"-DGGML_NATIVE=off",
"-DGGML_AVX=off",
"-DGGML_AVX2=off",
"-DGGML_AVX512=off",
"-DGGML_F16C=off",
"-DGGML_FMA=off",
"-DGGML_OPENMP=off")
$script:buildDir="../build/windows/${script:ARCH}_static"
write-host "Building static library"
build
$script:cmakeTargets = $oldTargets
} else {
write-host "Skipping CPU generation step as requested"
}
}
function build_cpu($gen_arch) {
if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu"))) {
# remaining llama.cpp builds use MSVC
@@ -398,7 +364,6 @@ init_vars
if ($($args.count) -eq 0) {
git_module_setup
apply_patches
build_static
if ($script:ARCH -eq "arm64") {
build_cpu("ARM64")
} else { # amd64


@@ -55,9 +55,9 @@ func (kv KV) ParameterCount() uint64 {
return kv.u64("general.parameter_count")
}
func (kv KV) FileType() fileType {
func (kv KV) FileType() FileType {
if u64 := kv.u64("general.file_type"); u64 > 0 {
return fileType(uint32(u64))
return FileType(uint32(u64))
}
return fileTypeUnknown
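A hedged sketch of the exported accessor in use. It assumes `KV` can be built as a plain string-keyed map literal (its definition is not shown in this hunk), so treat this as illustrative only:

```go
package main

import (
	"fmt"

	"github.com/ollama/ollama/llm"
)

func main() {
	// Assumption: KV is a string-keyed map and u64 reads the uint64 stored
	// under "general.file_type". FileType(1) is fileTypeF16 per the iota
	// block in llm/filetype.go, so String() yields "F16".
	kv := llm.KV{"general.file_type": uint64(1)}
	fmt.Println(kv.FileType()) // prints: F16
}
```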


@@ -1,41 +0,0 @@
package llm
// #cgo CFLAGS: -Illama.cpp -Illama.cpp/include -Illama.cpp/ggml/include
// #cgo LDFLAGS: -lllama -lggml -lstdc++ -lpthread
// #cgo darwin,arm64 LDFLAGS: -L${SRCDIR}/build/darwin/arm64_static -L${SRCDIR}/build/darwin/arm64_static/src -L${SRCDIR}/build/darwin/arm64_static/ggml/src -framework Accelerate -framework Metal
// #cgo darwin,amd64 LDFLAGS: -L${SRCDIR}/build/darwin/x86_64_static -L${SRCDIR}/build/darwin/x86_64_static/src -L${SRCDIR}/build/darwin/x86_64_static/ggml/src
// #cgo windows,amd64 LDFLAGS: -static-libstdc++ -static-libgcc -static -L${SRCDIR}/build/windows/amd64_static -L${SRCDIR}/build/windows/amd64_static/src -L${SRCDIR}/build/windows/amd64_static/ggml/src
// #cgo windows,arm64 LDFLAGS: -static-libstdc++ -static-libgcc -static -L${SRCDIR}/build/windows/arm64_static -L${SRCDIR}/build/windows/arm64_static/src -L${SRCDIR}/build/windows/arm64_static/ggml/src
// #cgo linux,amd64 LDFLAGS: -L${SRCDIR}/build/linux/x86_64_static -L${SRCDIR}/build/linux/x86_64_static/src -L${SRCDIR}/build/linux/x86_64_static/ggml/src
// #cgo linux,arm64 LDFLAGS: -L${SRCDIR}/build/linux/arm64_static -L${SRCDIR}/build/linux/arm64_static/src -L${SRCDIR}/build/linux/arm64_static/ggml/src
// #include <stdlib.h>
// #include "llama.h"
import "C"
import (
"errors"
"unsafe"
)
// SystemInfo is an unused example of calling llama.cpp functions using CGo
func SystemInfo() string {
return C.GoString(C.llama_print_system_info())
}
func Quantize(infile, outfile string, ftype fileType) error {
cinfile := C.CString(infile)
defer C.free(unsafe.Pointer(cinfile))
coutfile := C.CString(outfile)
defer C.free(unsafe.Pointer(coutfile))
params := C.llama_model_quantize_default_params()
params.nthread = -1
params.ftype = ftype.Value()
if rc := C.llama_model_quantize(cinfile, coutfile, &params); rc != 0 {
return errors.New("failed to quantize model. This model architecture may not be supported, or you may need to upgrade Ollama to the latest version")
}
return nil
}


@@ -26,6 +26,7 @@ import (
"github.com/ollama/ollama/auth"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/llama"
"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/parser"
"github.com/ollama/ollama/template"
@@ -453,7 +454,7 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio
defer temp.Close()
defer os.Remove(temp.Name())
if err := llm.Quantize(blob, temp.Name(), want); err != nil {
if err := llama.Quantize(blob, temp.Name(), want); err != nil {
return err
}