forked from third-party-mirrors/ollama
This adjusts the new runners to commingle with the existing runners so that an environment variable can be used to toggle the new runners on.
106 lines
3.7 KiB
Go
106 lines
3.7 KiB
Go
package llm
|
|
|
|
// #cgo CFLAGS: -Illama.cpp -Illama.cpp/include -Illama.cpp/ggml/include
|
|
// #cgo darwin,arm64 LDFLAGS: ${SRCDIR}/build/darwin/arm64_static/src/libllama.a ${SRCDIR}/build/darwin/arm64_static/ggml/src/libggml.a -framework Accelerate -lstdc++
|
|
// #cgo darwin,amd64 LDFLAGS: ${SRCDIR}/build/darwin/x86_64_static/src/libllama.a ${SRCDIR}/build/darwin/x86_64_static/ggml/src/libggml.a -framework Accelerate -lstdc++
|
|
// #cgo windows,amd64 LDFLAGS: ${SRCDIR}/build/windows/amd64_static/src/libllama.a ${SRCDIR}/build/windows/amd64_static/ggml/src/libggml.a -static -lstdc++
|
|
// #cgo windows,arm64 LDFLAGS: ${SRCDIR}/build/windows/arm64_static/src/libllama.a ${SRCDIR}/build/windows/arm64_static/ggml/src/libggml.a -static -lstdc++
|
|
// #cgo linux,amd64 LDFLAGS: ${SRCDIR}/build/linux/x86_64_static/src/libllama.a ${SRCDIR}/build/linux/x86_64_static/ggml/src/libggml.a -lstdc++
|
|
// #cgo linux,arm64 LDFLAGS: ${SRCDIR}/build/linux/arm64_static/src/libllama.a ${SRCDIR}/build/linux/arm64_static/ggml/src/libggml.a -lstdc++
|
|
// #include <stdlib.h>
|
|
// #include "llama.h"
|
|
import "C"
|
|
import (
|
|
"fmt"
|
|
"log/slog"
|
|
"unsafe"
|
|
)
|
|
|
|
// SystemInfo is an unused example of calling llama.cpp functions using CGo
|
|
func SystemInfo() string {
|
|
return C.GoString(C.llama_print_system_info())
|
|
}
|
|
|
|
func Quantize(infile, outfile string, ftype fileType) error {
|
|
cinfile := C.CString(infile)
|
|
defer C.free(unsafe.Pointer(cinfile))
|
|
|
|
coutfile := C.CString(outfile)
|
|
defer C.free(unsafe.Pointer(coutfile))
|
|
|
|
params := C.llama_model_quantize_default_params()
|
|
params.nthread = -1
|
|
params.ftype = ftype.Value()
|
|
|
|
if rc := C.llama_model_quantize(cinfile, coutfile, ¶ms); rc != 0 {
|
|
return fmt.Errorf("llama_model_quantize: %d", rc)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
type loadedModel struct {
|
|
model *C.struct_llama_model
|
|
}
|
|
|
|
func loadModel(modelfile string, vocabOnly bool) (*loadedModel, error) {
|
|
// TODO figure out how to quiet down the logging so we don't have 2 copies of the model metadata showing up
|
|
slog.Info("XXX initializing default model params")
|
|
params := C.llama_model_default_params()
|
|
params.vocab_only = C.bool(vocabOnly)
|
|
|
|
cmodelfile := C.CString(modelfile)
|
|
defer C.free(unsafe.Pointer(cmodelfile))
|
|
|
|
slog.Info("XXX loading model", "model", modelfile)
|
|
model := C.llama_load_model_from_file(cmodelfile, params)
|
|
if model == nil {
|
|
return nil, fmt.Errorf("failed to load model %s", modelfile)
|
|
}
|
|
return &loadedModel{model}, nil
|
|
}
|
|
|
|
func freeModel(model *loadedModel) {
|
|
C.llama_free_model(model.model)
|
|
}
|
|
|
|
func tokenize(model *loadedModel, content string) ([]int, error) {
|
|
ccontent := C.CString(content)
|
|
defer C.free(unsafe.Pointer(ccontent))
|
|
|
|
len := len(content) + 2
|
|
tokens := make([]C.int32_t, len)
|
|
|
|
tokenCount := C.llama_tokenize(model.model, ccontent, C.int32_t(len), &tokens[0], C.int32_t(len), true, true)
|
|
if tokenCount < 0 {
|
|
slog.Info("XXX got negative response", "count", tokenCount)
|
|
tokens = make([]C.int32_t, int(tokenCount))
|
|
tokenCount = C.llama_tokenize(model.model, ccontent, C.int32_t(len), &tokens[0], tokenCount, true, true)
|
|
} else if tokenCount == 0 {
|
|
return nil, nil
|
|
}
|
|
ret := make([]int, tokenCount)
|
|
for i := range int(tokenCount) {
|
|
ret[i] = int(tokens[i])
|
|
}
|
|
slog.Debug("XXX tokenized", "tokens", tokens, "content", content)
|
|
return ret, nil
|
|
}
|
|
|
|
func detokenize(model *loadedModel, tokens []int) string {
|
|
slog.Info("XXX in CGO detokenize")
|
|
var resp string
|
|
for _, token := range tokens {
|
|
buf := make([]C.char, 8)
|
|
nTokens := C.llama_token_to_piece(model.model, C.int(token), &buf[0], 8, 0, true)
|
|
if nTokens < 0 {
|
|
buf = make([]C.char, -nTokens)
|
|
nTokens = C.llama_token_to_piece(model.model, C.int(token), &buf[0], -nTokens, 0, true)
|
|
}
|
|
tokString := C.GoStringN(&buf[0], nTokens)
|
|
resp += tokString
|
|
}
|
|
slog.Debug("XXX detokenized", "tokens", tokens, "content", resp)
|
|
return resp
|
|
}
|