tokenize() passes a string length longer than the actual data into llama_tokenize(). The C++ code scans this entire length even though a NULL terminator sits in the correct location, because the buffer is converted into a std::string. The result is a read of uninitialized memory which, depending on the contents of that memory, fails the check for partial multi-byte UTF8 characters.

In addition, if there is not enough space in the passed buffer for token output, llama_tokenize() returns the required space as a negative number. We should convert this to a positive number before reallocating.

The first problem results in the following splat:

libc++abi: terminating due to uncaught exception of type std::invalid_argument: failed to convert utf8 to codepoint
SIGABRT: abort
PC=0x193cd55f0 m=11 sigcode=0
signal arrived during cgo execution

goroutine 27 gp=0x14000708700 m=11 mp=0x14000584908 [syscall]:
runtime.cgocall(0x105549e68, 0x140000c6bf8)
	/opt/homebrew/Cellar/go/1.22.5/libexec/src/runtime/cgocall.go:157 +0x44 fp=0x140000c6bc0 sp=0x140000c6b80 pc=0x104b372c4
github.com/ollama/ollama/llm._Cfunc_llama_tokenize(0x15180f400, 0x152009a00, 0x5aa, 0x140002e8800, 0x5aa, 0x1, 0x1)
	_cgo_gotypes.go:270 +0x34 fp=0x140000c6bf0 sp=0x140000c6bc0 pc=0x104ef7664
github.com/ollama/ollama/llm.tokenize.func2(0x140001dd800?, 0x152009a00, 0x5aa, 0x1400012cdc0?)
	/Users/jesse/ollama/llm/llm.go:74 +0x8c fp=0x140000c6c50 sp=0x140000c6bf0 pc=0x104ef83cc
github.com/ollama/ollama/llm.tokenize(0x140003f7da0, {0x140001dd800, 0x5a8})
	/Users/jesse/ollama/llm/llm.go:74 +0xb4 fp=0x140000c6d90 sp=0x140000c6c50 pc=0x104ef7f94
github.com/ollama/ollama/llm.(*llmServer).Tokenize(0x140000c6df8?, {0x105516574?, 0x5a8?}, {0x140001dd800?, 0x140000c6d00?})
	/Users/jesse/ollama/llm/server.go:963 +0x2c fp=0x140000c6dc0 sp=0x140000c6d90 pc=0x104ef6b6c
github.com/ollama/ollama/llm.LlamaServer.Tokenize-fm({0x105e876f0?, 0x140001e5c70?}, {0x140001dd800?, 0x140000350e0?})
	<autogenerated>:1 +0x50 fp=0x140000c6e00 sp=0x140000c6dc0 pc=0x105532fc0
github.com/ollama/ollama/server.chatPrompt({0x105e876f0, 0x140001e5c70}, 0x14000616480, 0x140000c7508, 0x1400013e000, {0x1400014e008, 0x7, 0x7}, {0x0, 0x0, ...})
	/Users/jesse/ollama/server/prompt.go:36 +0x2a0 fp=0x140000c7100 sp=0x140000c6e00 pc=0x1055165a0
github.com/ollama/ollama/server.(*Server).ChatHandler(0x1400000e9c0, 0x1400011c100)
	/Users/jesse/ollama/server/routes.go:1340 +0x478 fp=0x140000c7610 sp=0x140000c7100 pc=0x105523318
github.com/ollama/ollama/server.(*Server).ChatHandler-fm(0x9?)
	<autogenerated>:1 +0x30 fp=0x140000c7630 sp=0x140000c7610 pc=0x105533130
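The trace confirms the diagnosis: _Cfunc_llama_tokenize receives 0x5aa as both the text length and the output buffer size, while the Go string handed to tokenize() is only 0x5a8 bytes long (the length was padded by 2 for the output buffer and then reused as the text length). The fix is therefore twofold: pass the real byte length of the input as the text length, and negate a negative return value before using it as the new buffer capacity. A minimal sketch of the corrected call pattern, using the names from the file below:

	// llama_tokenize returns the number of tokens written, or, when the
	// output buffer is too small, the required capacity as a negative number.
	n := int(C.llama_tokenize(model.model, ccontent, C.int32_t(len(content)),
		&tokens[0], C.int32_t(len(tokens)), true, true))
	if n < 0 {
		tokens = make([]C.int32_t, -n) // flip the sign before reallocating
		n = int(C.llama_tokenize(model.model, ccontent, C.int32_t(len(content)),
			&tokens[0], C.int32_t(len(tokens)), true, true))
	}

The full file with both fixes applied follows.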
package llm

// #cgo CFLAGS: -Illama.cpp -Illama.cpp/include -Illama.cpp/ggml/include
// #cgo darwin,arm64 LDFLAGS: ${SRCDIR}/build/darwin/arm64_static/src/libllama.a ${SRCDIR}/build/darwin/arm64_static/ggml/src/libggml.a -framework Accelerate -lstdc++
// #cgo darwin,amd64 LDFLAGS: ${SRCDIR}/build/darwin/x86_64_static/src/libllama.a ${SRCDIR}/build/darwin/x86_64_static/ggml/src/libggml.a -framework Accelerate -lstdc++
// #cgo windows,amd64 LDFLAGS: ${SRCDIR}/build/windows/amd64_static/src/libllama.a ${SRCDIR}/build/windows/amd64_static/ggml/src/libggml.a -static -lstdc++
// #cgo windows,arm64 LDFLAGS: ${SRCDIR}/build/windows/arm64_static/src/libllama.a ${SRCDIR}/build/windows/arm64_static/ggml/src/libggml.a -static -lstdc++
// #cgo linux,amd64 LDFLAGS: ${SRCDIR}/build/linux/x86_64_static/src/libllama.a ${SRCDIR}/build/linux/x86_64_static/ggml/src/libggml.a -lstdc++
// #cgo linux,arm64 LDFLAGS: ${SRCDIR}/build/linux/arm64_static/src/libllama.a ${SRCDIR}/build/linux/arm64_static/ggml/src/libggml.a -lstdc++
// #include <stdlib.h>
// #include "llama.h"
import "C"

import (
	"fmt"
	"log/slog"
	"unsafe"
)

// SystemInfo is an unused example of calling llama.cpp functions using CGo
func SystemInfo() string {
	return C.GoString(C.llama_print_system_info())
}

func Quantize(infile, outfile string, ftype fileType) error {
	cinfile := C.CString(infile)
	defer C.free(unsafe.Pointer(cinfile))

	coutfile := C.CString(outfile)
	defer C.free(unsafe.Pointer(coutfile))

	params := C.llama_model_quantize_default_params()
	params.nthread = -1
	params.ftype = ftype.Value()

	if rc := C.llama_model_quantize(cinfile, coutfile, &params); rc != 0 {
		return fmt.Errorf("llama_model_quantize: %d", rc)
	}

	return nil
}
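// Illustrative usage sketch (not part of the original file): Quantize takes
// an input GGUF path, an output path, and a target fileType; the constant
// fileTypeQ4_0 below is an assumption about this package's fileType values.
//
//	if err := Quantize("model-f16.gguf", "model-q4_0.gguf", fileTypeQ4_0); err != nil {
//		slog.Error("quantize failed", "error", err)
//	}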
type loadedModel struct {
	model *C.struct_llama_model
}

func loadModel(modelfile string, vocabOnly bool) (*loadedModel, error) {
	// TODO figure out how to quiet down the logging so we don't have 2 copies of the model metadata showing up
	slog.Info("XXX initializing default model params")
	params := C.llama_model_default_params()
	params.vocab_only = C.bool(vocabOnly)

	cmodelfile := C.CString(modelfile)
	defer C.free(unsafe.Pointer(cmodelfile))

	slog.Info("XXX loading model", "model", modelfile)
	model := C.llama_load_model_from_file(cmodelfile, params)
	if model == nil {
		return nil, fmt.Errorf("failed to load model %s", modelfile)
	}
	return &loadedModel{model}, nil
}

func freeModel(model *loadedModel) {
	C.llama_free_model(model.model)
}
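// Illustrative usage sketch (not part of the original file): the intended
// lifecycle is load, use, free. vocabOnly=true is enough when the model is
// only needed for tokenize/detokenize.
//
//	m, err := loadModel("/path/to/model.gguf", true)
//	if err != nil {
//		return err
//	}
//	defer freeModel(m)
//	toks, err := tokenize(m, "why is the sky blue?")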
func tokenize(model *loadedModel, content string) ([]int, error) {
	ccontent := C.CString(content)
	defer C.free(unsafe.Pointer(ccontent))

	tokenCount := len(content) + 2
	tokens := make([]C.int32_t, tokenCount)

	// Pass the actual byte length of content as the text length; the +2
	// padding applies only to the output buffer size.
	tokenCount = int(C.llama_tokenize(model.model, ccontent, C.int32_t(len(content)),
		&tokens[0], C.int32_t(tokenCount), true, true))
	if tokenCount < 0 {
		// A negative return value is the required buffer size, negated;
		// flip the sign before reallocating and retrying.
		tokenCount = -tokenCount
		slog.Info("XXX got negative response", "count", tokenCount)
		tokens = make([]C.int32_t, tokenCount)
		tokenCount = int(C.llama_tokenize(model.model, ccontent, C.int32_t(len(content)), &tokens[0],
			C.int32_t(tokenCount), true, true))

		if tokenCount < 0 {
			return nil, fmt.Errorf("failed to tokenize: %d", tokenCount)
		}
	} else if tokenCount == 0 {
		return nil, nil
	}

	ret := make([]int, tokenCount)
	for i := range tokenCount {
		ret[i] = int(tokens[i])
	}
	slog.Debug("XXX tokenized", "tokens", tokens, "content", content)
	return ret, nil
}
func detokenize(model *loadedModel, tokens []int) string {
	slog.Info("XXX in CGO detokenize")
	var resp string
	for _, token := range tokens {
		buf := make([]C.char, 8)
		nTokens := C.llama_token_to_piece(model.model, C.int(token), &buf[0], 8, 0, true)
		if nTokens < 0 {
			// Same convention as llama_tokenize: a negative return value is
			// the required buffer size, negated.
			buf = make([]C.char, -nTokens)
			nTokens = C.llama_token_to_piece(model.model, C.int(token), &buf[0], -nTokens, 0, true)
		}
		tokString := C.GoStringN(&buf[0], nTokens)
		resp += tokString
	}
	slog.Debug("XXX detokenized", "tokens", tokens, "content", resp)
	return resp
}
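// Illustrative usage sketch (not part of the original file): tokenize and
// detokenize roughly round-trip, though tokenize requests special tokens
// (it passes add_special=true), so the detokenized text may include extra
// special-token pieces.
//
//	toks, err := tokenize(m, "why is the sky blue?")
//	if err != nil {
//		return err
//	}
//	text := detokenize(m, toks)
//	_ = text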