replace static build in llm

jmorganca 2024-05-18 22:22:46 -07:00
parent ec09be97e8
commit 01ccbc07fe
67 changed files with 14420 additions and 7669 deletions

.gitignore vendored (1 change)

@@ -5,7 +5,6 @@
.swp
dist
ollama
ggml-metal.metal
.cache
*.exe
.idea


@@ -56,6 +56,7 @@ bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
}
// backend buffer
GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
ggml_backend_buffer_type_t buft,
struct ggml_backend_buffer_i iface,
@@ -78,10 +79,6 @@ const char * ggml_backend_buffer_name(ggml_backend_buffer_t buffer) {
return buffer->iface.get_name(buffer);
}
#define ggml_assert_aligned(ptr) \
GGML_ASSERT(((uintptr_t) (ptr))%GGML_MEM_ALIGN == 0)
void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
if (buffer == NULL) {
return;

llama/ggml-cuda.cu vendored (5 changes)

@@ -715,9 +715,6 @@ static bool ggml_backend_buffer_is_cuda_split(ggml_backend_buffer_t buffer) {
GGML_CALL static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
delete ctx;
// HACK: this needs to be freed in msvc
free(buffer);
}
GGML_CALL static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
@@ -3031,7 +3028,7 @@ GGML_CALL static ggml_backend_t ggml_backend_reg_cuda_init(const char * params,
GGML_UNUSED(params);
}
// extern "C" GGML_CALL int ggml_backend_cuda_reg_devices();
extern "C" GGML_CALL int ggml_backend_cuda_reg_devices();
GGML_CALL int ggml_backend_cuda_reg_devices() {
int device_count = ggml_backend_cuda_get_device_count();

llama/ggml-cuda.h vendored (2 changes)

@@ -31,8 +31,6 @@ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_typ
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
GGML_API GGML_CALL int ggml_backend_cuda_reg_devices();
GGML_API GGML_CALL int ggml_backend_cuda_get_device_count(void);
GGML_API GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);

llama/ggml-metal.metal (new file, 6859 lines)

File diff suppressed because it is too large.

llama/ggml-metal.o (new binary file)

Binary file not shown.


@@ -3,14 +3,13 @@ package llama
// #cgo darwin,arm64 CFLAGS: -std=c11 -DGGML_USE_METAL -DGGML_METAL_EMBED_LIBRARY -DGGML_USE_ACCELERATE -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64
// #cgo darwin,arm64 CXXFLAGS: -std=c++11 -DGGML_USE_METAL -DGGML_METAL_EMBED_LIBRARY -DGGML_USE_ACCELERATE -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64
// #cgo darwin,amd64 CXXFLAGS: -std=c++11
// #cgo darwin,arm64 LDFLAGS: ggml-metal.o -framework Foundation -framework Metal -framework MetalKit -framework Accelerate
// #cgo darwin,amd64 LDFLAGS: -framework Foundation -framework Accelerate
// #cgo darwin,arm64 LDFLAGS: -ld_classic ${SRCDIR}/ggml-metal.o -framework Foundation -framework Metal -framework MetalKit -framework Accelerate
// #cgo darwin,amd64 LDFLAGS: -ld_classic -framework Foundation -framework Accelerate
// #cgo windows LDFLAGS: -lmsvcrt
// #cgo avx CFLAGS: -mavx
// #cgo avx CXXFLAGS: -mavx
// #cgo avx2 CFLAGS: -mavx -mavx2 -mfma
// #cgo avx2 CXXFLAGS: -mavx -mavx2 -mfma
// #cgo avx2 LDFLAGS: -lm
// #cgo cuda CFLAGS: -DGGML_USE_CUDA -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_MULTIPLATFORM -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
// #cgo cuda CXXFLAGS: -std=c++11 -DGGML_USE_CUDA -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_MULTIPLATFORM -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
// #cgo rocm CXXFLAGS: -std=c++11 -DGGML_USE_CUDA -DGGML_USE_HIPBLAS -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_MULTIPLATFORM -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
@@ -24,6 +23,8 @@ import (
"runtime"
"strings"
"unsafe"
"github.com/ollama/ollama/llm"
)
type Token int32
@@ -201,3 +202,21 @@ func (m *Model) Tokenize(text string, maxTokens int, addSpecial bool, parseSpeci
return tokens, nil
}
func Quantize(infile, outfile string, ftype llm.FileType) error {
cinfile := C.CString(infile)
defer C.free(unsafe.Pointer(cinfile))
coutfile := C.CString(outfile)
defer C.free(unsafe.Pointer(coutfile))
params := C.llama_model_quantize_default_params()
params.nthread = -1
params.ftype = ftype.Value()
if rc := C.llama_model_quantize(cinfile, coutfile, &params); rc != 0 {
return fmt.Errorf("llama_model_quantize: %d", rc)
}
return nil
}
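For context, a minimal sketch of how the new `llama.Quantize` entry point might be invoked alongside `llm.ParseFileType` (whose receiver type is exported later in this commit). The paths are placeholders, and "Q4_0" is an assumption based on the `fileTypeQ4_0` constant in `llm/filetype.go` below, not a case shown in this diff:

```go
package main

import (
	"log"

	"github.com/ollama/ollama/llama"
	"github.com/ollama/ollama/llm"
)

func main() {
	// "Q4_0" is an assumption: the ParseFileType switch below only shows
	// its "F32" case, but the constant list includes fileTypeQ4_0.
	ftype, err := llm.ParseFileType("Q4_0")
	if err != nil {
		log.Fatal(err)
	}

	// Placeholder paths; Quantize forwards to llama_model_quantize
	// exactly as shown in the hunk above.
	if err := llama.Quantize("model-f16.gguf", "model-q4_0.gguf", ftype); err != nil {
		log.Fatal(err)
	}
}
```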


@@ -1,11 +0,0 @@
sed -e '/#include "ggml-common.h"/r ggml-common.h' -e '/#include "ggml-common.h"/d' < ggml-metal.metal > ggml-metal-embed.metal
TEMP_ASSEMBLY=$(mktemp)
echo ".section __DATA, __ggml_metallib" > $TEMP_ASSEMBLY
echo ".globl _ggml_metallib_start" >> $TEMP_ASSEMBLY
echo "_ggml_metallib_start:" >> $TEMP_ASSEMBLY
echo ".incbin \"ggml-metal-embed.metal\"" >> $TEMP_ASSEMBLY
echo ".globl _ggml_metallib_end" >> $TEMP_ASSEMBLY
echo "_ggml_metallib_end:" >> $TEMP_ASSEMBLY
as -mmacosx-version-min=11.3 $TEMP_ASSEMBLY -o ggml-metal.o
rm -f $TEMP_ASSEMBLY
rm -rf ggml-metal-embed.metal


@@ -5,5 +5,5 @@
```
```
curl POST -H "Content-Type: application/json" -d '{"prompt": "hi"}' http://localhost:8080/
curl -X POST -H "Content-Type: application/json" -d '{"prompt": "hi"}' http://localhost:8080/
```
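The corrected endpoint can also be exercised from Go; a minimal client sketch, assuming only the address and request shape shown in this README. The per-token response objects are printed raw, since their JSON tags do not appear in this diff:

```go
package main

import (
	"bufio"
	"bytes"
	"fmt"
	"net/http"
)

func main() {
	// Request shape and address taken from the README above.
	body := bytes.NewBufferString(`{"prompt": "hi"}`)
	resp, err := http.Post("http://localhost:8080/", "application/json", body)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	// The server streams one JSON object per token over chunked encoding;
	// json.Encoder terminates each object with a newline, so a line
	// scanner is enough to split them.
	scanner := bufio.NewScanner(resp.Body)
	for scanner.Scan() {
		fmt.Println(scanner.Text())
	}
}
```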


@@ -23,29 +23,9 @@ type Response struct {
type Server struct {
model *llama.Model
lc *llama.Context
batch *llama.Batch
queue chan Sequence
seqs []*Sequence
// mu guards seqs
mu sync.Mutex
}
type Sequence struct {
prompt []llama.Token
out chan string
}
func schedule(parallel int, queue <-chan Sequence) {
// Fill sequences from the queue
// once a sequence finishes, remove it and add a new one from the queue
}
func process() {
// loop through the sequences, fill a batch, decode and sample tokens, responding to appropriate requests
}
var mu sync.Mutex
func (s *Server) stream(w http.ResponseWriter, r *http.Request) {
var request Request
@@ -59,23 +39,15 @@ func (s *Server) stream(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Transfer-Encoding", "chunked")
w.WriteHeader(http.StatusOK)
enc := json.NewEncoder(w)
// main loop
tokens, err := s.model.Tokenize(request.Prompt, 2048, true, true)
if err != nil {
panic(err)
}
seq := Sequence{prompt: tokens}
s.queue <- seq
// listen for the sequence to finish
for {
str := <-seq.out
if err := json.NewEncoder(w).Encode(&Response{Token: str}); err != nil {
log.Println("Failed to encode result:", err)
return
}
w.(http.Flusher).Flush()
}
batch := llama.NewBatch(512, 0, 1)
// prompt eval
for i, t := range tokens {
@@ -115,7 +87,6 @@ func (s *Server) stream(w http.ResponseWriter, r *http.Request) {
func main() {
mp := flag.String("model", "", "Path to model binary file")
parallel := flag.Int("parallel", 1, "Number of parallel requests to handle")
flag.Parse()
// load the model
@@ -131,8 +102,6 @@ func main() {
server := &Server{
model: model,
lc: lc,
queue: make(chan Sequence, 256),
seqs: make([]*Sequence, *parallel),
}
addr := "127.0.0.1:8080"


@@ -23,7 +23,7 @@ cp $src_dir/ggml-quants.c $dst_dir/ggml-quants.c
cp $src_dir/ggml-quants.h $dst_dir/ggml-quants.h
cp $src_dir/ggml-metal.metal $dst_dir/ggml-metal.metal
cp $src_dir/ggml-metal.h $dst_dir/ggml-metal.h
cp $src_dir/ggml-metal.m $dst_dir/ggml-metal-darwin_arm64.m
cp $src_dir/ggml-metal.m $dst_dir/ggml-metal.m
cp $src_dir/ggml-impl.h $dst_dir/ggml-impl.h
cp $src_dir/ggml-cuda.h $dst_dir/ggml-cuda.h
cp $src_dir/ggml-cuda.cu $dst_dir/ggml-cuda.cu
@@ -34,11 +34,23 @@ cp $src_dir/ggml-backend-impl.h $dst_dir/ggml-backend-impl.h
cp $src_dir/ggml-alloc.h $dst_dir/ggml-alloc.h
cp $src_dir/ggml-alloc.c $dst_dir/ggml-alloc.c
sed -i 's/extern "C" GGML_CALL int ggml_backend_cuda_reg_devices();/\/\/ extern "C" GGML_CALL int ggml_backend_cuda_reg_devices();/' ggml-cuda.cu
sed -i '34iGGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);' ggml-cuda.h
# ggml-cuda
mkdir -p $dst_dir/ggml-cuda
cp $src_dir/ggml-cuda/*.cu $dst_dir/ggml-cuda/
cp $src_dir/ggml-cuda/*.cuh $dst_dir/ggml-cuda/
sed -i 's/extern "C" GGML_CALL int ggml_backend_cuda_reg_devices();/\/\/ extern "C" GGML_CALL int ggml_backend_cuda_reg_devices();/' ggml-cuda.cu
sed -i '34iGGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);' ggml-cuda.h
# ggml-metal
sed -e '/#include "ggml-common.h"/r ggml-common.h' -e '/#include "ggml-common.h"/d' < ggml-metal.metal > temp.metal
TEMP_ASSEMBLY=$(mktemp)
echo ".section __DATA, __ggml_metallib" > $TEMP_ASSEMBLY
echo ".globl _ggml_metallib_start" >> $TEMP_ASSEMBLY
echo "_ggml_metallib_start:" >> $TEMP_ASSEMBLY
echo ".incbin \"temp.metal\"" >> $TEMP_ASSEMBLY
echo ".globl _ggml_metallib_end" >> $TEMP_ASSEMBLY
echo "_ggml_metallib_end:" >> $TEMP_ASSEMBLY
as -mmacosx-version-min=11.3 $TEMP_ASSEMBLY -o ggml-metal.o
rm -f $TEMP_ASSEMBLY
rm -rf temp.metal


@@ -2,10 +2,10 @@ package llm
import "fmt"
type fileType uint32
type FileType uint32
const (
fileTypeF32 fileType = iota
fileTypeF32 FileType = iota
fileTypeF16
fileTypeQ4_0
fileTypeQ4_1
@@ -41,7 +41,7 @@ const (
fileTypeUnknown
)
func ParseFileType(s string) (fileType, error) {
func ParseFileType(s string) (FileType, error) {
switch s {
case "F32":
return fileTypeF32, nil
@@ -108,7 +108,7 @@ func ParseFileType(s string) (fileType, error) {
}
}
func (t fileType) String() string {
func (t FileType) String() string {
switch t {
case fileTypeF32:
return "F32"
@@ -175,6 +175,6 @@ func (t fileType) String() string {
}
}
func (t fileType) Value() uint32 {
func (t FileType) Value() uint32 {
return uint32(t)
}
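With `FileType` exported, callers outside `llm` can now round-trip between the string, enum, and numeric forms; a small sketch using only the accessors visible in this diff:

```go
package main

import (
	"fmt"
	"log"

	"github.com/ollama/ollama/llm"
)

func main() {
	// "F32" is the one switch case shown above; it maps to fileTypeF32,
	// the first constant in the iota block, so Value() reports 0.
	ft, err := llm.ParseFileType("F32")
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("%s = %d\n", ft.String(), ft.Value())
}
```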


@@ -58,19 +58,6 @@ init_vars
git_module_setup
apply_patches
init_vars
if [ -z "${OLLAMA_SKIP_STATIC_GENERATE}" -o "${OLLAMA_CPU_TARGET}" = "static" ]; then
# Builds by default, allows skipping, forces build if OLLAMA_CPU_TARGET="static"
# Enables optimized Dockerfile builds using a blanket skip and targeted overrides
# Static build for linking into the Go binary
init_vars
CMAKE_TARGETS="--target llama --target ggml"
CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DGGML_NATIVE=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_OPENMP=off ${CMAKE_DEFS}"
BUILD_DIR="../build/linux/${ARCH}_static"
echo "Building static library"
build
fi
init_vars
if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
# Users building from source can tune the exact flags we pass to cmake for configuring


@@ -177,40 +177,6 @@ function cleanup {
# -DGGML_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen
# -DGGML_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver
function build_static() {
if ((-not "${env:OLLAMA_SKIP_STATIC_GENERATE}") -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "static"))) {
# GCC build for direct linking into the Go binary
init_vars
# cmake will silently fall back to msvc compilers if mingw isn't in the path, so detect and fail fast
# as we need this to be compiled by gcc for golang to be able to link with it
write-host "Checking for MinGW..."
# error action ensures we exit on failure
get-command gcc
get-command mingw32-make
$oldTargets = $script:cmakeTargets
$script:cmakeTargets = @("llama", "ggml")
$script:cmakeDefs = @(
"-G", "MinGW Makefiles"
"-DCMAKE_C_COMPILER=gcc.exe",
"-DCMAKE_CXX_COMPILER=g++.exe",
"-DBUILD_SHARED_LIBS=off",
"-DGGML_NATIVE=off",
"-DGGML_AVX=off",
"-DGGML_AVX2=off",
"-DGGML_AVX512=off",
"-DGGML_F16C=off",
"-DGGML_FMA=off",
"-DGGML_OPENMP=off")
$script:buildDir="../build/windows/${script:ARCH}_static"
write-host "Building static library"
build
$script:cmakeTargets = $oldTargets
} else {
write-host "Skipping CPU generation step as requested"
}
}
function build_cpu($gen_arch) {
if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu"))) {
# remaining llama.cpp builds use MSVC
@@ -398,7 +364,6 @@ init_vars
if ($($args.count) -eq 0) {
git_module_setup
apply_patches
build_static
if ($script:ARCH -eq "arm64") {
build_cpu("ARM64")
} else { # amd64


@@ -55,9 +55,9 @@ func (kv KV) ParameterCount() uint64 {
return kv.u64("general.parameter_count")
}
func (kv KV) FileType() fileType {
func (kv KV) FileType() FileType {
if u64 := kv.u64("general.file_type"); u64 > 0 {
return fileType(uint32(u64))
return FileType(uint32(u64))
}
return fileTypeUnknown
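A hedged sketch of the exported accessor in use. It assumes `KV` can be built as a plain string-keyed map literal (its definition is not shown in this hunk), so treat this as illustrative only:

```go
package main

import (
	"fmt"

	"github.com/ollama/ollama/llm"
)

func main() {
	// Assumption: KV is a string-keyed map and u64 reads the uint64 stored
	// under "general.file_type". FileType(1) is fileTypeF16 per the iota
	// block in llm/filetype.go, so String() yields "F16".
	kv := llm.KV{"general.file_type": uint64(1)}
	fmt.Println(kv.FileType()) // prints: F16
}
```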


@@ -1,41 +0,0 @@
package llm
// #cgo CFLAGS: -Illama.cpp -Illama.cpp/include -Illama.cpp/ggml/include
// #cgo LDFLAGS: -lllama -lggml -lstdc++ -lpthread
// #cgo darwin,arm64 LDFLAGS: -L${SRCDIR}/build/darwin/arm64_static -L${SRCDIR}/build/darwin/arm64_static/src -L${SRCDIR}/build/darwin/arm64_static/ggml/src -framework Accelerate -framework Metal
// #cgo darwin,amd64 LDFLAGS: -L${SRCDIR}/build/darwin/x86_64_static -L${SRCDIR}/build/darwin/x86_64_static/src -L${SRCDIR}/build/darwin/x86_64_static/ggml/src
// #cgo windows,amd64 LDFLAGS: -static-libstdc++ -static-libgcc -static -L${SRCDIR}/build/windows/amd64_static -L${SRCDIR}/build/windows/amd64_static/src -L${SRCDIR}/build/windows/amd64_static/ggml/src
// #cgo windows,arm64 LDFLAGS: -static-libstdc++ -static-libgcc -static -L${SRCDIR}/build/windows/arm64_static -L${SRCDIR}/build/windows/arm64_static/src -L${SRCDIR}/build/windows/arm64_static/ggml/src
// #cgo linux,amd64 LDFLAGS: -L${SRCDIR}/build/linux/x86_64_static -L${SRCDIR}/build/linux/x86_64_static/src -L${SRCDIR}/build/linux/x86_64_static/ggml/src
// #cgo linux,arm64 LDFLAGS: -L${SRCDIR}/build/linux/arm64_static -L${SRCDIR}/build/linux/arm64_static/src -L${SRCDIR}/build/linux/arm64_static/ggml/src
// #include <stdlib.h>
// #include "llama.h"
import "C"
import (
"errors"
"unsafe"
)
// SystemInfo is an unused example of calling llama.cpp functions using CGo
func SystemInfo() string {
return C.GoString(C.llama_print_system_info())
}
func Quantize(infile, outfile string, ftype fileType) error {
cinfile := C.CString(infile)
defer C.free(unsafe.Pointer(cinfile))
coutfile := C.CString(outfile)
defer C.free(unsafe.Pointer(coutfile))
params := C.llama_model_quantize_default_params()
params.nthread = -1
params.ftype = ftype.Value()
if rc := C.llama_model_quantize(cinfile, coutfile, &params); rc != 0 {
return errors.New("failed to quantize model. This model architecture may not be supported, or you may need to upgrade Ollama to the latest version")
}
return nil
}


@@ -26,6 +26,7 @@ import (
"github.com/ollama/ollama/auth"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/llama"
"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/parser"
"github.com/ollama/ollama/template"
@@ -453,7 +454,7 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio
defer temp.Close()
defer os.Remove(temp.Name())
if err := llm.Quantize(blob, temp.Name(), want); err != nil {
if err := llama.Quantize(blob, temp.Name(), want); err != nil {
return err
}