forked from third-party-mirrors/ollama

replace static build in llm

commit 01ccbc07fe (parent ec09be97e8)
.gitignore (vendored, 1 line changed)
@@ -5,7 +5,6 @@
.swp
dist
ollama
ggml-metal.metal
.cache
*.exe
.idea
llama/ggml-backend.c (vendored, 5 lines changed)
@@ -56,6 +56,7 @@ bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
}

// backend buffer

GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
        ggml_backend_buffer_type_t buft,
        struct ggml_backend_buffer_i iface,
@@ -78,10 +79,6 @@ const char * ggml_backend_buffer_name(ggml_backend_buffer_t buffer) {
    return buffer->iface.get_name(buffer);
}

#define ggml_assert_aligned(ptr) \
    GGML_ASSERT(((uintptr_t) (ptr))%GGML_MEM_ALIGN == 0)


void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
    if (buffer == NULL) {
        return;
llama/ggml-cuda.cu (vendored, 5 lines changed)
@@ -715,9 +715,6 @@ static bool ggml_backend_buffer_is_cuda_split(ggml_backend_buffer_t buffer) {
GGML_CALL static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
    delete ctx;

    // HACK: this needs to be freed in msvc
    free(buffer);
}

GGML_CALL static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
@@ -3031,7 +3028,7 @@ GGML_CALL static ggml_backend_t ggml_backend_reg_cuda_init(const char * params,
    GGML_UNUSED(params);
}

// extern "C" GGML_CALL int ggml_backend_cuda_reg_devices();
extern "C" GGML_CALL int ggml_backend_cuda_reg_devices();

GGML_CALL int ggml_backend_cuda_reg_devices() {
    int device_count = ggml_backend_cuda_get_device_count();
llama/ggml-cuda.h (vendored, 2 lines changed)
@@ -31,8 +31,6 @@ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_typ
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);

GGML_API GGML_CALL int ggml_backend_cuda_reg_devices();

GGML_API GGML_CALL int ggml_backend_cuda_get_device_count(void);
GGML_API GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
llama/ggml-metal.metal (6859 lines)
File diff suppressed because it is too large.
llama/ggml-metal.o (binary)
Binary file not shown.
@@ -3,14 +3,13 @@ package llama
// #cgo darwin,arm64 CFLAGS: -std=c11 -DGGML_USE_METAL -DGGML_METAL_EMBED_LIBRARY -DGGML_USE_ACCELERATE -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64
// #cgo darwin,arm64 CXXFLAGS: -std=c++11 -DGGML_USE_METAL -DGGML_METAL_EMBED_LIBRARY -DGGML_USE_ACCELERATE -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64
// #cgo darwin,amd64 CXXFLAGS: -std=c++11
// #cgo darwin,arm64 LDFLAGS: ggml-metal.o -framework Foundation -framework Metal -framework MetalKit -framework Accelerate
// #cgo darwin,amd64 LDFLAGS: -framework Foundation -framework Accelerate
// #cgo darwin,arm64 LDFLAGS: -ld_classic ${SRCDIR}/ggml-metal.o -framework Foundation -framework Metal -framework MetalKit -framework Accelerate
// #cgo darwin,amd64 LDFLAGS: -ld_classic -framework Foundation -framework Accelerate
// #cgo windows LDFLAGS: -lmsvcrt
// #cgo avx CFLAGS: -mavx
// #cgo avx CXXFLAGS: -mavx
// #cgo avx2 CFLAGS: -mavx -mavx2 -mfma
// #cgo avx2 CXXFLAGS: -mavx -mavx2 -mfma
// #cgo avx2 LDFLAGS: -lm
// #cgo cuda CFLAGS: -DGGML_USE_CUDA -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_MULTIPLATFORM -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
// #cgo cuda CXXFLAGS: -std=c++11 -DGGML_USE_CUDA -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_MULTIPLATFORM -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
// #cgo rocm CXXFLAGS: -std=c++11 -DGGML_USE_CUDA -DGGML_USE_HIPBLAS -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_MULTIPLATFORM -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
@@ -24,6 +23,8 @@ import (
    "runtime"
    "strings"
    "unsafe"

    "github.com/ollama/ollama/llm"
)

type Token int32
@@ -201,3 +202,21 @@ func (m *Model) Tokenize(text string, maxTokens int, addSpecial bool, parseSpeci

    return tokens, nil
}

func Quantize(infile, outfile string, ftype llm.FileType) error {
    cinfile := C.CString(infile)
    defer C.free(unsafe.Pointer(cinfile))

    coutfile := C.CString(outfile)
    defer C.free(unsafe.Pointer(coutfile))

    params := C.llama_model_quantize_default_params()
    params.nthread = -1
    params.ftype = ftype.Value()

    if rc := C.llama_model_quantize(cinfile, coutfile, &params); rc != 0 {
        return fmt.Errorf("llama_model_quantize: %d", rc)
    }

    return nil
}
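For orientation: the Quantize function added above in package llama takes the now-exported llm.FileType instead of the old unexported fileType. A minimal caller sketch, mirroring the CreateModel call-site change at the end of this diff; the "Q4_0" label, the "model.gguf" path, and the temp-file handling are illustrative assumptions, not part of this commit:

```
package main

import (
    "fmt"
    "log"
    "os"

    "github.com/ollama/ollama/llama"
    "github.com/ollama/ollama/llm"
)

func main() {
    // "Q4_0" is assumed to be one of the labels ParseFileType accepts;
    // only "F32" is visible in this diff.
    want, err := llm.ParseFileType("Q4_0")
    if err != nil {
        log.Fatal(err)
    }

    // Quantize writes to a separate output file, so stage the result in a
    // temp file, as the CreateModel hunk below does after this change.
    temp, err := os.CreateTemp("", "quantized-*.gguf")
    if err != nil {
        log.Fatal(err)
    }
    defer temp.Close()
    defer os.Remove(temp.Name())

    if err := llama.Quantize("model.gguf", temp.Name(), want); err != nil {
        log.Fatal(err)
    }
    fmt.Println("quantized model written to", temp.Name())
}
```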
@@ -1,11 +0,0 @@
sed -e '/#include "ggml-common.h"/r ggml-common.h' -e '/#include "ggml-common.h"/d' < ggml-metal.metal > ggml-metal-embed.metal
TEMP_ASSEMBLY=$(mktemp)
echo ".section __DATA, __ggml_metallib" > $TEMP_ASSEMBLY
echo ".globl _ggml_metallib_start" >> $TEMP_ASSEMBLY
echo "_ggml_metallib_start:" >> $TEMP_ASSEMBLY
echo ".incbin \"ggml-metal-embed.metal\"" >> $TEMP_ASSEMBLY
echo ".globl _ggml_metallib_end" >> $TEMP_ASSEMBLY
echo "_ggml_metallib_end:" >> $TEMP_ASSEMBLY
as -mmacosx-version-min=11.3 $TEMP_ASSEMBLY -o ggml-metal.o
rm -f $TEMP_ASSEMBLY
rm -rf ggml-metal-embed.metal
@@ -5,5 +5,5 @@
```

```
curl POST -H "Content-Type: application/json" -d '{"prompt": "hi"}' http://localhost:8080/
curl -X POST -H "Content-Type: application/json" -d '{"prompt": "hi"}' http://localhost:8080/
```
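The README's curl call above posts a prompt to the runner and reads back a chunked stream of JSON objects, one per token (see the Response encoding in the runner hunk below). A small Go client sketch under those assumptions; it only prints the raw JSON lines rather than assuming field names:

```
package main

import (
    "bufio"
    "bytes"
    "fmt"
    "log"
    "net/http"
)

func main() {
    // Post a prompt to the runner at the address from the README above.
    resp, err := http.Post("http://localhost:8080/", "application/json",
        bytes.NewBufferString(`{"prompt": "hi"}`))
    if err != nil {
        log.Fatal(err)
    }
    defer resp.Body.Close()

    // json.Encoder on the server side writes one JSON value per line,
    // so a line scanner is enough to consume the stream.
    scanner := bufio.NewScanner(resp.Body)
    for scanner.Scan() {
        fmt.Println(scanner.Text())
    }
    if err := scanner.Err(); err != nil {
        log.Fatal(err)
    }
}
```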
@@ -23,29 +23,9 @@ type Response struct {
type Server struct {
    model *llama.Model
    lc *llama.Context
    batch *llama.Batch

    queue chan Sequence
    seqs []*Sequence

    // mu guards seqs
    mu sync.Mutex
}

type Sequence struct {
    prompt []llama.Token
    out chan string
}

func schedule(parallel int, queue <-chan Sequence) {
    // Fill sequences from the queue

    // once a sequence finishes, remove it from and add a new one from the queue
}

func process() {
    // loop through the sequences, fill a batch, decode and sample tokens, responding to appropriate requests
}
var mu sync.Mutex

func (s *Server) stream(w http.ResponseWriter, r *http.Request) {
    var request Request
@@ -59,23 +39,15 @@ func (s *Server) stream(w http.ResponseWriter, r *http.Request) {
    w.Header().Set("Transfer-Encoding", "chunked")
    w.WriteHeader(http.StatusOK)

    enc := json.NewEncoder(w)

    // main loop
    tokens, err := s.model.Tokenize(request.Prompt, 2048, true, true)
    if err != nil {
        panic(err)
    }

    seq := Sequence{prompt: tokens}
    s.queue <- seq

    // listen for the sequence to finish
    for {
        str := <-seq.out
        if err := json.NewEncoder(w).Encode(&Response{Token: str}); err != nil {
            log.Println("Failed to encode result:", err)
            return
        }
        w.(http.Flusher).Flush()
    }
    batch := llama.NewBatch(512, 0, 1)

    // prompt eval
    for i, t := range tokens {
@@ -115,7 +87,6 @@ func (s *Server) stream(w http.ResponseWriter, r *http.Request) {

func main() {
    mp := flag.String("model", "", "Path to model binary file")
    parallel := flag.Int("parallel", 1, "Number of parallel requests to handle")
    flag.Parse()

    // load the model
@@ -131,8 +102,6 @@ func main() {
    server := &Server{
        model: model,
        lc: lc,
        queue: make(chan Sequence, 256),
        seqs: make([]*Sequence, *parallel),
    }

    addr := "127.0.0.1:8080"
@@ -23,7 +23,7 @@ cp $src_dir/ggml-quants.c $dst_dir/ggml-quants.c
cp $src_dir/ggml-quants.h $dst_dir/ggml-quants.h
cp $src_dir/ggml-metal.metal $dst_dir/ggml-metal.metal
cp $src_dir/ggml-metal.h $dst_dir/ggml-metal.h
cp $src_dir/ggml-metal.m $dst_dir/ggml-metal-darwin_arm64.m
cp $src_dir/ggml-metal.m $dst_dir/ggml-metal.m
cp $src_dir/ggml-impl.h $dst_dir/ggml-impl.h
cp $src_dir/ggml-cuda.h $dst_dir/ggml-cuda.h
cp $src_dir/ggml-cuda.cu $dst_dir/ggml-cuda.cu
@@ -34,11 +34,23 @@ cp $src_dir/ggml-backend-impl.h $dst_dir/ggml-backend-impl.h
cp $src_dir/ggml-alloc.h $dst_dir/ggml-alloc.h
cp $src_dir/ggml-alloc.c $dst_dir/ggml-alloc.c

sed -i 's/extern "C" GGML_CALL int ggml_backend_cuda_reg_devices();/\/\/ extern "C" GGML_CALL int ggml_backend_cuda_reg_devices();/' ggml-cuda.cu
sed -i '34iGGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);' ggml-cuda.h


# ggml-cuda
mkdir -p $dst_dir/ggml-cuda
cp $src_dir/ggml-cuda/*.cu $dst_dir/ggml-cuda/
cp $src_dir/ggml-cuda/*.cuh $dst_dir/ggml-cuda/

sed -i 's/extern "C" GGML_CALL int ggml_backend_cuda_reg_devices();/\/\/ extern "C" GGML_CALL int ggml_backend_cuda_reg_devices();/' ggml-cuda.cu
sed -i '34iGGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);' ggml-cuda.h

# ggml-metal
sed -e '/#include "ggml-common.h"/r ggml-common.h' -e '/#include "ggml-common.h"/d' < ggml-metal.metal > temp.metal
TEMP_ASSEMBLY=$(mktemp)
echo ".section __DATA, __ggml_metallib" > $TEMP_ASSEMBLY
echo ".globl _ggml_metallib_start" >> $TEMP_ASSEMBLY
echo "_ggml_metallib_start:" >> $TEMP_ASSEMBLY
echo ".incbin \"temp.metal\"" >> $TEMP_ASSEMBLY
echo ".globl _ggml_metallib_end" >> $TEMP_ASSEMBLY
echo "_ggml_metallib_end:" >> $TEMP_ASSEMBLY
as -mmacosx-version-min=11.3 $TEMP_ASSEMBLY -o ggml-metal.o
rm -f $TEMP_ASSEMBLY
rm -rf temp.metal
@@ -2,10 +2,10 @@ package llm

import "fmt"

type fileType uint32
type FileType uint32

const (
    fileTypeF32 fileType = iota
    fileTypeF32 FileType = iota
    fileTypeF16
    fileTypeQ4_0
    fileTypeQ4_1
@@ -41,7 +41,7 @@ const (
    fileTypeUnknown
)

func ParseFileType(s string) (fileType, error) {
func ParseFileType(s string) (FileType, error) {
    switch s {
    case "F32":
        return fileTypeF32, nil
@@ -108,7 +108,7 @@ func ParseFileType(s string) (fileType, error) {
    }
}

func (t fileType) String() string {
func (t FileType) String() string {
    switch t {
    case fileTypeF32:
        return "F32"
@@ -175,6 +175,6 @@ func (t fileType) String() string {
    }
}

func (t fileType) Value() uint32 {
func (t FileType) Value() uint32 {
    return uint32(t)
}
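With the type exported, code outside the llm package can parse, print, and pass quantization types around. A minimal round-trip sketch; only the "F32" label is confirmed by this hunk, the rest follows from the exported signatures:

```
package main

import (
    "fmt"
    "log"

    "github.com/ollama/ollama/llm"
)

func main() {
    // "F32" is the one label shown in this hunk's ParseFileType switch.
    ft, err := llm.ParseFileType("F32")
    if err != nil {
        log.Fatal(err)
    }
    // String() round-trips the label; Value() is the raw uint32 that
    // llama.Quantize writes into the quantize params.
    fmt.Println(ft.String(), ft.Value())
}
```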
@@ -58,19 +58,6 @@ init_vars
git_module_setup
apply_patches

init_vars
if [ -z "${OLLAMA_SKIP_STATIC_GENERATE}" -o "${OLLAMA_CPU_TARGET}" = "static" ]; then
    # Builds by default, allows skipping, forces build if OLLAMA_CPU_TARGET="static"
    # Enables optimized Dockerfile builds using a blanket skip and targeted overrides
    # Static build for linking into the Go binary
    init_vars
    CMAKE_TARGETS="--target llama --target ggml"
    CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DGGML_NATIVE=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_OPENMP=off ${CMAKE_DEFS}"
    BUILD_DIR="../build/linux/${ARCH}_static"
    echo "Building static library"
    build
fi

init_vars
if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
    # Users building from source can tune the exact flags we pass to cmake for configuring
@@ -177,40 +177,6 @@ function cleanup {
# -DGGML_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen
# -DGGML_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver


function build_static() {
    if ((-not "${env:OLLAMA_SKIP_STATIC_GENERATE}") -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "static"))) {
        # GCC build for direct linking into the Go binary
        init_vars
        # cmake will silently fallback to msvc compilers if mingw isn't in the path, so detect and fail fast
        # as we need this to be compiled by gcc for golang to be able to link with itx
        write-host "Checking for MinGW..."
        # error action ensures we exit on failure
        get-command gcc
        get-command mingw32-make
        $oldTargets = $script:cmakeTargets
        $script:cmakeTargets = @("llama", "ggml")
        $script:cmakeDefs = @(
            "-G", "MinGW Makefiles"
            "-DCMAKE_C_COMPILER=gcc.exe",
            "-DCMAKE_CXX_COMPILER=g++.exe",
            "-DBUILD_SHARED_LIBS=off",
            "-DGGML_NATIVE=off",
            "-DGGML_AVX=off",
            "-DGGML_AVX2=off",
            "-DGGML_AVX512=off",
            "-DGGML_F16C=off",
            "-DGGML_FMA=off",
            "-DGGML_OPENMP=off")
        $script:buildDir="../build/windows/${script:ARCH}_static"
        write-host "Building static library"
        build
        $script:cmakeTargets = $oldTargets
    } else {
        write-host "Skipping CPU generation step as requested"
    }
}

function build_cpu($gen_arch) {
    if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu"))) {
        # remaining llama.cpp builds use MSVC
@@ -398,7 +364,6 @@ init_vars
if ($($args.count) -eq 0) {
    git_module_setup
    apply_patches
    build_static
    if ($script:ARCH -eq "arm64") {
        build_cpu("ARM64")
    } else { # amd64
@@ -55,9 +55,9 @@ func (kv KV) ParameterCount() uint64 {
    return kv.u64("general.parameter_count")
}

func (kv KV) FileType() fileType {
func (kv KV) FileType() FileType {
    if u64 := kv.u64("general.file_type"); u64 > 0 {
        return fileType(uint32(u64))
        return FileType(uint32(u64))
    }

    return fileTypeUnknown
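Because KV.FileType now returns the exported type, callers outside the llm package can compare a model's recorded file type against a requested quantization directly. A hypothetical helper sketch; the name needsQuantize and the surrounding wiring are not part of this diff:

```
package quantcheck

import "github.com/ollama/ollama/llm"

// needsQuantize is a hypothetical check: quantization can be skipped when
// the GGUF metadata already records the requested file type.
func needsQuantize(kv llm.KV, want llm.FileType) bool {
    return kv.FileType() != want
}
```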
llm/llm.go (deleted, 41 lines)
@@ -1,41 +0,0 @@
package llm

// #cgo CFLAGS: -Illama.cpp -Illama.cpp/include -Illama.cpp/ggml/include
// #cgo LDFLAGS: -lllama -lggml -lstdc++ -lpthread
// #cgo darwin,arm64 LDFLAGS: -L${SRCDIR}/build/darwin/arm64_static -L${SRCDIR}/build/darwin/arm64_static/src -L${SRCDIR}/build/darwin/arm64_static/ggml/src -framework Accelerate -framework Metal
// #cgo darwin,amd64 LDFLAGS: -L${SRCDIR}/build/darwin/x86_64_static -L${SRCDIR}/build/darwin/x86_64_static/src -L${SRCDIR}/build/darwin/x86_64_static/ggml/src
// #cgo windows,amd64 LDFLAGS: -static-libstdc++ -static-libgcc -static -L${SRCDIR}/build/windows/amd64_static -L${SRCDIR}/build/windows/amd64_static/src -L${SRCDIR}/build/windows/amd64_static/ggml/src
// #cgo windows,arm64 LDFLAGS: -static-libstdc++ -static-libgcc -static -L${SRCDIR}/build/windows/arm64_static -L${SRCDIR}/build/windows/arm64_static/src -L${SRCDIR}/build/windows/arm64_static/ggml/src
// #cgo linux,amd64 LDFLAGS: -L${SRCDIR}/build/linux/x86_64_static -L${SRCDIR}/build/linux/x86_64_static/src -L${SRCDIR}/build/linux/x86_64_static/ggml/src
// #cgo linux,arm64 LDFLAGS: -L${SRCDIR}/build/linux/arm64_static -L${SRCDIR}/build/linux/arm64_static/src -L${SRCDIR}/build/linux/arm64_static/ggml/src
// #include <stdlib.h>
// #include "llama.h"
import "C"

import (
    "errors"
    "unsafe"
)

// SystemInfo is an unused example of calling llama.cpp functions using CGo
func SystemInfo() string {
    return C.GoString(C.llama_print_system_info())
}

func Quantize(infile, outfile string, ftype fileType) error {
    cinfile := C.CString(infile)
    defer C.free(unsafe.Pointer(cinfile))

    coutfile := C.CString(outfile)
    defer C.free(unsafe.Pointer(coutfile))

    params := C.llama_model_quantize_default_params()
    params.nthread = -1
    params.ftype = ftype.Value()

    if rc := C.llama_model_quantize(cinfile, coutfile, &params); rc != 0 {
        return errors.New("failed to quantize model. This model architecture may not be supported, or you may need to upgrade Ollama to the latest version")
    }

    return nil
}
@@ -26,6 +26,7 @@ import (
    "github.com/ollama/ollama/auth"
    "github.com/ollama/ollama/envconfig"
    "github.com/ollama/ollama/format"
    "github.com/ollama/ollama/llama"
    "github.com/ollama/ollama/llm"
    "github.com/ollama/ollama/parser"
    "github.com/ollama/ollama/template"
@@ -453,7 +454,7 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio
    defer temp.Close()
    defer os.Remove(temp.Name())

    if err := llm.Quantize(blob, temp.Name(), want); err != nil {
    if err := llama.Quantize(blob, temp.Name(), want); err != nil {
        return err
    }
