Move Go code out of llm package
parent c7cb0f0602
commit 4bbdbbcaef
@@ -9,7 +9,7 @@ import (
 "log/slog"
 "strings"

-"github.com/ollama/ollama/llm"
+"github.com/ollama/ollama/fileutils"
 )

 type ModelParameters struct {

@@ -27,8 +27,8 @@ type AdapterParameters struct {
 } `json:"lora_parameters"`
 }

-func (ModelParameters) KV(t *Tokenizer) llm.KV {
+func (ModelParameters) KV(t *Tokenizer) fileutils.KV {
-kv := llm.KV{
+kv := fileutils.KV{
 "general.file_type": uint32(1),
 "general.quantization_version": uint32(2),
 "tokenizer.ggml.pre": t.Pre,

@@ -54,7 +54,7 @@ func (ModelParameters) KV(t *Tokenizer) llm.KV {
 return kv
 }

-func (p AdapterParameters) KV() llm.KV {
+func (p AdapterParameters) KV() fileutils.KV {
 var alpha float32
 if p.LoraParameters.Alpha == 0 {
 alpha = float32(p.Alpha)

@@ -62,7 +62,7 @@ func (p AdapterParameters) KV() llm.KV {
 alpha = p.LoraParameters.Alpha
 }

-kv := llm.KV{
+kv := fileutils.KV{
 "adapter.lora.alpha": alpha,
 "adapter.type": "lora",
 "general.file_type": uint32(1),

@@ -79,19 +79,19 @@ func (ModelParameters) specialTokenTypes() []string {
 }
 }

-func (ModelParameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error {
+func (ModelParameters) writeFile(ws io.WriteSeeker, kv fileutils.KV, ts []fileutils.Tensor) error {
-return llm.WriteGGUF(ws, kv, ts)
+return fileutils.WriteGGUF(ws, kv, ts)
 }

-func (AdapterParameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error {
+func (AdapterParameters) writeFile(ws io.WriteSeeker, kv fileutils.KV, ts []fileutils.Tensor) error {
-return llm.WriteGGUF(ws, kv, ts)
+return fileutils.WriteGGUF(ws, kv, ts)
 }

 type ModelConverter interface {
 // KV maps parameters to LLM key-values
-KV(*Tokenizer) llm.KV
+KV(*Tokenizer) fileutils.KV
 // Tensors maps input tensors to LLM tensors. Model specific modifications can be done here.
-Tensors([]Tensor) []llm.Tensor
+Tensors([]Tensor) []fileutils.Tensor
 // Replacements returns a list of string pairs to replace in tensor names.
 // See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details
 Replacements() []string

@@ -99,7 +99,7 @@ type ModelConverter interface {
 // specialTokenTypes returns any special token types the model uses
 specialTokenTypes() []string
 // writeFile writes the model to the provided io.WriteSeeker
-writeFile(io.WriteSeeker, llm.KV, []llm.Tensor) error
+writeFile(io.WriteSeeker, fileutils.KV, []fileutils.Tensor) error
 }

 type moreParser interface {

@@ -108,17 +108,17 @@ type moreParser interface {

 type AdapterConverter interface {
 // KV maps parameters to LLM key-values
-KV(llm.KV) llm.KV
+KV(fileutils.KV) fileutils.KV
 // Tensors maps input tensors to LLM tensors. Adapter specific modifications can be done here.
-Tensors([]Tensor) []llm.Tensor
+Tensors([]Tensor) []fileutils.Tensor
 // Replacements returns a list of string pairs to replace in tensor names.
 // See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details
 Replacements() []string

-writeFile(io.WriteSeeker, llm.KV, []llm.Tensor) error
+writeFile(io.WriteSeeker, fileutils.KV, []fileutils.Tensor) error
 }

-func ConvertAdapter(fsys fs.FS, ws io.WriteSeeker, baseKV llm.KV) error {
+func ConvertAdapter(fsys fs.FS, ws io.WriteSeeker, baseKV fileutils.KV) error {
 bts, err := fs.ReadFile(fsys, "adapter_config.json")
 if err != nil {
 return err
@@ -8,7 +8,7 @@ import (
 "slices"
 "strings"

-"github.com/ollama/ollama/llm"
+"github.com/ollama/ollama/fileutils"
 )

 type bertModel struct {

@@ -85,7 +85,7 @@ func (p *bertModel) parseMore(fsys fs.FS) error {
 return nil
 }

-func (p *bertModel) KV(t *Tokenizer) llm.KV {
+func (p *bertModel) KV(t *Tokenizer) fileutils.KV {
 kv := p.ModelParameters.KV(t)
 kv["general.architecture"] = "bert"
 kv["bert.attention.causal"] = false

@@ -132,8 +132,8 @@ func (p *bertModel) KV(t *Tokenizer) llm.KV {
 return kv
 }

-func (p *bertModel) Tensors(ts []Tensor) []llm.Tensor {
+func (p *bertModel) Tensors(ts []Tensor) []fileutils.Tensor {
-var out []llm.Tensor
+var out []fileutils.Tensor
 for _, t := range ts {
 if slices.Contains([]string{
 "embeddings.position_ids",

@@ -143,7 +143,7 @@ func (p *bertModel) Tensors(ts []Tensor) []llm.Tensor {
 continue
 }

-out = append(out, llm.Tensor{
+out = append(out, fileutils.Tensor{
 Name: t.Name(),
 Kind: t.Kind(),
 Shape: t.Shape(),
@@ -6,7 +6,7 @@ import (
 "github.com/pdevine/tensor"
 "github.com/pdevine/tensor/native"

-"github.com/ollama/ollama/llm"
+"github.com/ollama/ollama/fileutils"
 )

 type gemmaModel struct {

@@ -23,7 +23,7 @@ type gemmaModel struct {

 var _ ModelConverter = (*gemmaModel)(nil)

-func (p *gemmaModel) KV(t *Tokenizer) llm.KV {
+func (p *gemmaModel) KV(t *Tokenizer) fileutils.KV {
 kv := p.ModelParameters.KV(t)
 kv["general.architecture"] = "gemma"
 kv["gemma.context_length"] = p.MaxPositionEmbeddings

@@ -42,14 +42,14 @@ func (p *gemmaModel) KV(t *Tokenizer) llm.KV {
 return kv
 }

-func (p *gemmaModel) Tensors(ts []Tensor) []llm.Tensor {
+func (p *gemmaModel) Tensors(ts []Tensor) []fileutils.Tensor {
-var out []llm.Tensor
+var out []fileutils.Tensor
 for _, t := range ts {
 if strings.HasSuffix(t.Name(), "_norm.weight") {
 t.SetRepacker(p.addOne)
 }

-out = append(out, llm.Tensor{
+out = append(out, fileutils.Tensor{
 Name: t.Name(),
 Kind: t.Kind(),
 Shape: t.Shape(),
@@ -1,7 +1,7 @@
 package convert

 import (
-"github.com/ollama/ollama/llm"
+"github.com/ollama/ollama/fileutils"
 )

 type gemma2Model struct {

@@ -11,7 +11,7 @@ type gemma2Model struct {
 FinalLogitSoftcap float32 `json:"final_logit_softcapping"`
 }

-func (p *gemma2Model) KV(t *Tokenizer) llm.KV {
+func (p *gemma2Model) KV(t *Tokenizer) fileutils.KV {
 kv := p.ModelParameters.KV(t)
 kv["general.architecture"] = "gemma2"
 kv["gemma2.context_length"] = p.MaxPositionEmbeddings
@@ -6,7 +6,7 @@ import (
 "github.com/pdevine/tensor"
 "github.com/pdevine/tensor/native"

-"github.com/ollama/ollama/llm"
+"github.com/ollama/ollama/fileutils"
 )

 type gemma2Adapter struct {

@@ -15,14 +15,14 @@ type gemma2Adapter struct {

 var _ AdapterConverter = (*gemma2Adapter)(nil)

-func (p *gemma2Adapter) KV(baseKV llm.KV) llm.KV {
+func (p *gemma2Adapter) KV(baseKV fileutils.KV) fileutils.KV {
 kv := p.AdapterParameters.KV()
 kv["general.architecture"] = "gemma2"
 return kv
 }

-func (p *gemma2Adapter) Tensors(ts []Tensor) []llm.Tensor {
+func (p *gemma2Adapter) Tensors(ts []Tensor) []fileutils.Tensor {
-var out []llm.Tensor
+var out []fileutils.Tensor
 for _, t := range ts {
 shape := t.Shape()
 if (strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) ||

@@ -31,7 +31,7 @@ func (p *gemma2Adapter) Tensors(ts []Tensor) []llm.Tensor {
 t.SetRepacker(p.repack)
 }

-out = append(out, llm.Tensor{
+out = append(out, fileutils.Tensor{
 Name: t.Name(),
 Kind: t.Kind(),
 Shape: t.Shape(),
@@ -9,7 +9,7 @@ import (
 "github.com/pdevine/tensor"
 "github.com/pdevine/tensor/native"

-"github.com/ollama/ollama/llm"
+"github.com/ollama/ollama/fileutils"
 )

 type llamaModel struct {

@@ -46,7 +46,7 @@ type llamaModel struct {

 var _ ModelConverter = (*llamaModel)(nil)

-func (p *llamaModel) KV(t *Tokenizer) llm.KV {
+func (p *llamaModel) KV(t *Tokenizer) fileutils.KV {
 kv := p.ModelParameters.KV(t)
 kv["general.architecture"] = "llama"
 kv["llama.vocab_size"] = p.VocabSize

@@ -120,11 +120,11 @@ func (p *llamaModel) KV(t *Tokenizer) llm.KV {
 return kv
 }

-func (p *llamaModel) Tensors(ts []Tensor) []llm.Tensor {
+func (p *llamaModel) Tensors(ts []Tensor) []fileutils.Tensor {
-var out []llm.Tensor
+var out []fileutils.Tensor

 if p.RopeScaling.factors != nil {
-out = append(out, llm.Tensor{
+out = append(out, fileutils.Tensor{
 Name: "rope_freqs.weight",
 Kind: 0,
 Shape: []uint64{uint64(len(p.RopeScaling.factors))},

@@ -138,7 +138,7 @@ func (p *llamaModel) Tensors(ts []Tensor) []llm.Tensor {
 t.SetRepacker(p.repack)
 }

-out = append(out, llm.Tensor{
+out = append(out, fileutils.Tensor{
 Name: t.Name(),
 Kind: t.Kind(),
 Shape: t.Shape(),
@@ -7,7 +7,7 @@ import (
 "github.com/pdevine/tensor"
 "github.com/pdevine/tensor/native"

-"github.com/ollama/ollama/llm"
+"github.com/ollama/ollama/fileutils"
 )

 type llamaAdapter struct {

@@ -18,7 +18,7 @@ type llamaAdapter struct {

 var _ AdapterConverter = (*llamaAdapter)(nil)

-func (p *llamaAdapter) KV(baseKV llm.KV) llm.KV {
+func (p *llamaAdapter) KV(baseKV fileutils.KV) fileutils.KV {
 kv := p.AdapterParameters.KV()
 kv["general.architecture"] = "llama"
 kv["llama.attention.head_count"] = baseKV["llama.attention.head_count"]

@@ -29,8 +29,8 @@ func (p *llamaAdapter) KV(baseKV llm.KV) llm.KV {
 return kv
 }

-func (p *llamaAdapter) Tensors(ts []Tensor) []llm.Tensor {
+func (p *llamaAdapter) Tensors(ts []Tensor) []fileutils.Tensor {
-var out []llm.Tensor
+var out []fileutils.Tensor
 for _, t := range ts {
 shape := t.Shape()
 if (strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) ||

@@ -41,7 +41,7 @@ func (p *llamaAdapter) Tensors(ts []Tensor) []llm.Tensor {
 t.SetRepacker(p.repack)
 }

-out = append(out, llm.Tensor{
+out = append(out, fileutils.Tensor{
 Name: t.Name(),
 Kind: t.Kind(),
 Shape: shape,
@@ -6,7 +6,7 @@ import (
 "slices"
 "strings"

-"github.com/ollama/ollama/llm"
+"github.com/ollama/ollama/fileutils"
 )

 type mixtralModel struct {

@@ -15,7 +15,7 @@ type mixtralModel struct {
 NumExpertsPerToken uint32 `json:"num_experts_per_tok"`
 }

-func (p *mixtralModel) KV(t *Tokenizer) llm.KV {
+func (p *mixtralModel) KV(t *Tokenizer) fileutils.KV {
 kv := p.llamaModel.KV(t)

 if p.NumLocalExperts > 0 {

@@ -29,7 +29,7 @@ func (p *mixtralModel) KV(t *Tokenizer) llm.KV {
 return kv
 }

-func (p *mixtralModel) Tensors(ts []Tensor) []llm.Tensor {
+func (p *mixtralModel) Tensors(ts []Tensor) []fileutils.Tensor {
 oldnew := []string{
 "model.layers", "blk",
 "w1", "ffn_gate_exps",

@@ -56,10 +56,10 @@ func (p *mixtralModel) Tensors(ts []Tensor) []llm.Tensor {
 return true
 })

-var out []llm.Tensor
+var out []fileutils.Tensor
 for n, e := range experts {
 // TODO(mxyng): sanity check experts
-out = append(out, llm.Tensor{
+out = append(out, fileutils.Tensor{
 Name: n,
 Kind: e[0].Kind(),
 Shape: append([]uint64{uint64(len(e))}, e[0].Shape()...),
@@ -8,7 +8,7 @@ import (
 "strings"
 "sync"

-"github.com/ollama/ollama/llm"
+"github.com/ollama/ollama/fileutils"
 )

 type phi3Model struct {

@@ -37,7 +37,7 @@ type phi3Model struct {

 var _ ModelConverter = (*phi3Model)(nil)

-func (p *phi3Model) KV(t *Tokenizer) llm.KV {
+func (p *phi3Model) KV(t *Tokenizer) fileutils.KV {
 kv := p.ModelParameters.KV(t)
 kv["general.architecture"] = "phi3"
 kv["phi3.context_length"] = p.MaxPositionEmbeddings

@@ -68,19 +68,19 @@ func (p *phi3Model) KV(t *Tokenizer) llm.KV {
 return kv
 }

-func (p *phi3Model) Tensors(ts []Tensor) []llm.Tensor {
+func (p *phi3Model) Tensors(ts []Tensor) []fileutils.Tensor {
 var addRopeFactors sync.Once

-out := make([]llm.Tensor, 0, len(ts)+2)
+out := make([]fileutils.Tensor, 0, len(ts)+2)
 for _, t := range ts {
 if strings.HasPrefix(t.Name(), "blk.0.") {
 addRopeFactors.Do(func() {
-out = append(out, llm.Tensor{
+out = append(out, fileutils.Tensor{
 Name: "rope_factors_long.weight",
 Kind: 0,
 Shape: []uint64{uint64(len(p.RopeScaling.LongFactor))},
 WriterTo: p.RopeScaling.LongFactor,
-}, llm.Tensor{
+}, fileutils.Tensor{
 Name: "rope_factors_short.weight",
 Kind: 0,
 Shape: []uint64{uint64(len(p.RopeScaling.ShortFactor))},

@@ -89,7 +89,7 @@ func (p *phi3Model) Tensors(ts []Tensor) []llm.Tensor {
 })
 }

-out = append(out, llm.Tensor{
+out = append(out, fileutils.Tensor{
 Name: t.Name(),
 Kind: t.Kind(),
 Shape: t.Shape(),
@@ -20,7 +20,7 @@ import (

 "golang.org/x/exp/maps"

-"github.com/ollama/ollama/llm"
+"github.com/ollama/ollama/fileutils"
 )

 type tensorData struct {

@@ -29,7 +29,7 @@ type tensorData struct {
 Shape []int `json:"shape"`
 }

-func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, *llm.Tensors) {
+func convertFull(t *testing.T, fsys fs.FS) (*os.File, fileutils.KV, *fileutils.Tensors) {
 t.Helper()

 f, err := os.CreateTemp(t.TempDir(), "f16")

@@ -48,7 +48,7 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, *llm.Tensors) {
 }
 t.Cleanup(func() { r.Close() })

-m, _, err := llm.DecodeGGML(r, math.MaxInt)
+m, _, err := fileutils.DecodeGGML(r, math.MaxInt)
 if err != nil {
 t.Fatal(err)
 }

@@ -60,7 +60,7 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, *llm.Tensors) {
 return r, m.KV(), m.Tensors()
 }

-func generateResultsJSON(t *testing.T, f *os.File, kv llm.KV, tensors *llm.Tensors) map[string]string {
+func generateResultsJSON(t *testing.T, f *os.File, kv fileutils.KV, tensors *fileutils.Tensors) map[string]string {
 actual := make(map[string]string)
 for k, v := range kv {
 if s, ok := v.(json.Marshaler); !ok {

@@ -330,7 +330,7 @@ func TestConvertAdapter(t *testing.T) {
 }
 defer r.Close()

-m, _, err := llm.DecodeGGML(r, math.MaxInt)
+m, _, err := fileutils.DecodeGGML(r, math.MaxInt)
 if err != nil {
 t.Fatal(err)
 }
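The converter interfaces above now consume and produce `fileutils.KV` and `fileutils.Tensor` values instead of their `llm` counterparts. As a rough sketch (not part of this commit) of how an adapter conversion is driven after the move, based only on the `ConvertAdapter` signature and the `GGML.KV` method visible in this diff; the wrapper function and its name are hypothetical:

```go
package example

import (
	"io"
	"io/fs"

	"github.com/ollama/ollama/convert"
	"github.com/ollama/ollama/fileutils"
)

// convertLoRA writes a converted LoRA adapter to out, passing along the
// key-values of the base model so architecture-specific fields can be copied.
func convertLoRA(adapterDir fs.FS, out io.WriteSeeker, base *fileutils.GGML) error {
	return convert.ConvertAdapter(adapterDir, out, base.KV())
}
```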
discover/README.md (new file, 3 lines)
@@ -0,0 +1,3 @@
+# `discover`
+
+This package is responsible for discovering information about the system and the capabilities to run LLM. This includes GPU and CPU discovery so the optimal runner can be chosen for a given model. The ollama scheduler relies on up-to-date available memory information, so this package provides the ability to refresh free memory as efficiently as possible.
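As a quick orientation, here is a minimal sketch (not part of this commit) of querying the package. It uses only identifiers that appear elsewhere in this diff (`discover.GetGPUInfo`, `discover.GetCPUInfo`, and the `Library` field of the returned GPU entries); the return types and the fallback condition are assumptions.

```go
package main

import (
	"fmt"

	"github.com/ollama/ollama/discover"
)

func main() {
	// Ask for GPU information; fall back to CPU info if nothing was discovered
	// (simplified condition, for illustration only).
	gpus := discover.GetGPUInfo()
	if len(gpus) == 0 {
		gpus = discover.GetCPUInfo()
	}
	for _, g := range gpus {
		// Library is e.g. "cpu" or "metal", as seen in NewLlamaServer below.
		fmt.Println("discovered:", g.Library)
	}
}
```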
fileutils/README.md (new file, 3 lines)
@@ -0,0 +1,3 @@
+# `modelfile`
+
+This package provides utilities for loading and inspecting model files
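As a rough illustration of what "loading and inspecting model files" looks like after this move, here is a sketch built only from the `LoadModel`, `GGML.KV`, and `KV` usages visible in this diff; the model path is hypothetical.

```go
package main

import (
	"fmt"
	"log"

	"github.com/ollama/ollama/fileutils"
)

func main() {
	// LoadModel decodes a GGML-format model file; a maxArraySize of 0 uses the
	// default cap of 1024 entries per array.
	ggml, err := fileutils.LoadModel("/path/to/model.gguf", 0) // hypothetical path
	if err != nil {
		log.Fatal(err)
	}

	kv := ggml.KV()
	fmt.Println("architecture:", kv["general.architecture"])
}
```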
@@ -1,9 +1,11 @@
-package llm
+package fileutils

 import "fmt"

 type fileType uint32

+// TODO this should map over to the GGML CGO enum type
+
 const (
 fileTypeF32 fileType = iota
 fileTypeF16

@@ -1,4 +1,4 @@
-package llm
+package fileutils

 import (
 "encoding/binary"

@@ -1,10 +1,11 @@
-package llm
+package fileutils

 import (
 "encoding/binary"
 "errors"
 "fmt"
 "io"
+"os"
 "slices"
 "strings"
 "sync"

@@ -488,3 +489,23 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui

 return
 }
+
+// LoadModel will load a model from disk. The model must be in the GGML format.
+//
+// It collects array values for arrays with a size less than or equal to
+// maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If
+// the maxArraySize is negative, all arrays are collected.
+func LoadModel(model string, maxArraySize int) (*GGML, error) {
+if _, err := os.Stat(model); err != nil {
+return nil, err
+}
+
+f, err := os.Open(model)
+if err != nil {
+return nil, err
+}
+defer f.Close()
+
+ggml, _, err := DecodeGGML(f, maxArraySize)
+return ggml, err
+}
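The `maxArraySize` convention documented above is how callers control how much array data (for example tokenizer vocabularies) gets decoded: the server's `getKVData`, later in this diff, passes 0 normally and -1 when verbose output is requested. A small sketch of that pattern, under the assumption that only the KV metadata is needed; the helper name and path are hypothetical:

```go
package example

import (
	"github.com/ollama/ollama/fileutils"
)

// loadKV mirrors the getKVData pattern: cap arrays at the default size unless
// the caller asked for everything.
func loadKV(path string, verbose bool) (fileutils.KV, error) {
	maxArraySize := 0 // 0 = default cap of 1024 entries per array
	if verbose {
		maxArraySize = -1 // negative = collect all array values
	}
	ggml, err := fileutils.LoadModel(path, maxArraySize)
	if err != nil {
		return nil, err
	}
	return ggml.KV(), nil
}
```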
fileutils/ggml_test.go (new file, 1 line)
@@ -0,0 +1 @@
+package fileutils

@@ -1,4 +1,4 @@
-package llm
+package fileutils

 import (
 "bytes"

@@ -1,4 +1,4 @@
-package llm
+package fileutils

 import (
 "fmt"

@@ -329,7 +329,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
 return estimate
 }

-func (m MemoryEstimate) log() {
+func (m MemoryEstimate) Log() {
 overhead := envconfig.GpuOverhead()

 log := slog.With()

@@ -1,4 +1,4 @@
-package llm
+package fileutils

 import (
 "bytes"

@@ -1 +0,0 @@
-package llm
runners/README.md (new file, 3 lines)
@@ -0,0 +1,3 @@
+# `runners`
+
+Ollama uses a subprocess model to run one or more child processes to load the LLM. On some platforms (Linux non-containerized, MacOS) these executables are carried as payloads inside the main executable via the ../build package. Extraction and discovery of these runners at runtime is implemented in this package. This package also provides the abstraction to communicate with these subprocesses.
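For context, a minimal sketch (not from this commit) of the extraction half of that description, using only the `Refresh`, `GetAvailableServers`, and `ServerForCpu` helpers that this diff moves into or keeps in the package; error handling and output are simplified.

```go
package main

import (
	"fmt"
	"log"

	"github.com/ollama/ollama/build"
	"github.com/ollama/ollama/runners"
)

func main() {
	// Refresh extracts the embedded runner payloads (or re-extracts them if a
	// tmp cleaner removed them) and reports where they live.
	dir, err := runners.Refresh(build.EmbedFS)
	if err != nil {
		log.Fatal(err)
	}
	available := runners.GetAvailableServers(dir)
	fmt.Printf("%d runners extracted under %s\n", len(available), dir)
	fmt.Println("cpu runner:", runners.ServerForCpu())
}
```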
@@ -2,6 +2,7 @@ package runners

 import (
 "compress/gzip"
+"context"
 "errors"
 "fmt"
 "io"

@@ -15,9 +16,11 @@ import (
 "strings"
 "sync"
 "syscall"
+"time"

 "golang.org/x/sync/errgroup"

+"github.com/ollama/ollama/api"
 "github.com/ollama/ollama/discover"
 "github.com/ollama/ollama/envconfig"
 )

@@ -31,6 +34,36 @@ var (
 runnersDir = ""
 )

+type CompletionRequest struct {
+Prompt string
+Format string
+Images []ImageData
+Options *api.Options
+}
+
+type CompletionResponse struct {
+Content string
+DoneReason string
+Done bool
+PromptEvalCount int
+PromptEvalDuration time.Duration
+EvalCount int
+EvalDuration time.Duration
+}
+
+type LLMServer interface {
+Ping(ctx context.Context) error
+WaitUntilRunning(ctx context.Context) error
+Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error
+Embedding(ctx context.Context, input string) ([]float32, error)
+Tokenize(ctx context.Context, content string) ([]int, error)
+Detokenize(ctx context.Context, tokens []int) (string, error)
+Close() error
+EstimatedVRAM() uint64 // Total VRAM across all GPUs
+EstimatedTotal() uint64
+EstimatedVRAMByGPU(gpuID string) uint64
+}
+
 // Return the location where runners are stored
 // If runners are payloads, this will either extract them
 // or refresh them if any have disappeared due to tmp cleaners
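The `LLMServer` interface added above is what the HTTP handlers hold onto after scheduling. A sketch of driving it, assuming a server was already obtained elsewhere (for example from `NewLlamaServer`, shown later in this diff) and using only methods and fields declared in the interface; the helper itself is hypothetical:

```go
package example

import (
	"context"
	"strings"

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/runners"
)

// generate streams a completion from an already-scheduled LLMServer.
func generate(ctx context.Context, s runners.LLMServer, prompt string, opts *api.Options) (string, error) {
	if err := s.WaitUntilRunning(ctx); err != nil {
		return "", err
	}
	var sb strings.Builder
	err := s.Completion(ctx, runners.CompletionRequest{
		Prompt:  prompt,
		Options: opts,
	}, func(r runners.CompletionResponse) {
		// The callback is invoked for each streamed response; Done marks the last one.
		sb.WriteString(r.Content)
	})
	return sb.String(), err
}
```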
@@ -1,4 +1,4 @@
-package llm
+package runners

 import (
 "bufio"

@@ -28,24 +28,11 @@ import (
 "github.com/ollama/ollama/build"
 "github.com/ollama/ollama/discover"
 "github.com/ollama/ollama/envconfig"
+"github.com/ollama/ollama/fileutils"
 "github.com/ollama/ollama/format"
 "github.com/ollama/ollama/llama"
-"github.com/ollama/ollama/runners"
 )

-type LlamaServer interface {
-Ping(ctx context.Context) error
-WaitUntilRunning(ctx context.Context) error
-Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error
-Embedding(ctx context.Context, input string) ([]float32, error)
-Tokenize(ctx context.Context, content string) ([]int, error)
-Detokenize(ctx context.Context, tokens []int) (string, error)
-Close() error
-EstimatedVRAM() uint64 // Total VRAM across all GPUs
-EstimatedTotal() uint64
-EstimatedVRAMByGPU(gpuID string) uint64
-}
-
 // llmServer is an instance of the llama.cpp server
 type llmServer struct {
 port int

@@ -58,7 +45,7 @@ type llmServer struct {
 modelLock sync.Mutex // Temporary until we switch fully to Go server
 model *llama.Model // If non-nil, the runner is a new Go server

-estimate MemoryEstimate
+estimate fileutils.MemoryEstimate
 totalLayers uint64
 // gpuCount int
 gpus discover.GpuInfoList // Recorded just before the model loaded, free space will be incorrect

@@ -68,32 +55,12 @@ type llmServer struct {
 sem *semaphore.Weighted
 }

-// LoadModel will load a model from disk. The model must be in the GGML format.
-//
-// It collects array values for arrays with a size less than or equal to
-// maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If
-// the maxArraySize is negative, all arrays are collected.
-func LoadModel(model string, maxArraySize int) (*GGML, error) {
-if _, err := os.Stat(model); err != nil {
-return nil, err
-}
-
-f, err := os.Open(model)
-if err != nil {
-return nil, err
-}
-defer f.Close()
-
-ggml, _, err := DecodeGGML(f, maxArraySize)
-return ggml, err
-}
-
 // NewLlamaServer will run a server for the given GPUs
 // The gpu list must be a single family.
-func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
+func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *fileutils.GGML, adapters, projectors []string, opts api.Options, numParallel int) (LLMServer, error) {
 var err error
 var cpuRunner string
-var estimate MemoryEstimate
+var estimate fileutils.MemoryEstimate
 var systemTotalMemory uint64
 var systemFreeMemory uint64
 var systemSwapFreeMemory uint64

@@ -109,10 +76,10 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 gpus = discover.GetCPUInfo()
 }
 if len(gpus) == 1 && gpus[0].Library == "cpu" {
-cpuRunner = runners.ServerForCpu()
+cpuRunner = ServerForCpu()
-estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
+estimate = fileutils.EstimateGPULayers(gpus, ggml, projectors, opts)
 } else {
-estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
+estimate = fileutils.EstimateGPULayers(gpus, ggml, projectors, opts)

 switch {
 case gpus[0].Library == "metal" && estimate.VRAMSize > systemTotalMemory:

@@ -121,7 +88,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 opts.NumGPU = 0
 case gpus[0].Library != "metal" && estimate.Layers == 0:
 // Don't bother loading into the GPU if no layers can fit
-cpuRunner = runners.ServerForCpu()
+cpuRunner = ServerForCpu()
 gpus = discover.GetCPUInfo()
 case opts.NumGPU < 0 && estimate.Layers > 0 && gpus[0].Library != "cpu":
 opts.NumGPU = estimate.Layers

@@ -139,7 +106,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 }
 }

-estimate.log()
+estimate.Log()

 // Loop through potential servers
 finalErr := errors.New("no suitable llama servers found")

@@ -148,12 +115,12 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 return nil, errors.New("ollama supports only one lora adapter, but multiple were provided")
 }

-rDir, err := runners.Refresh(build.EmbedFS)
+rDir, err := Refresh(build.EmbedFS)
 if err != nil {
 return nil, err
 }

-availableServers := runners.GetAvailableServers(rDir)
+availableServers := GetAvailableServers(rDir)
 if len(availableServers) == 0 {
 return nil, finalErr
 }

@@ -161,7 +128,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 if cpuRunner != "" {
 servers = []string{cpuRunner}
 } else {
-servers = runners.ServersForGpu(gpus[0]) // All GPUs in the list are matching Library and Variant
+servers = ServersForGpu(gpus[0]) // All GPUs in the list are matching Library and Variant
 }
 demandLib := envconfig.LLMLibrary()
 if demandLib != "" {

@@ -325,7 +292,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 _, err := os.Stat(server)
 if errors.Is(err, os.ErrNotExist) {
 slog.Warn("llama server disappeared, reinitializing payloads", "path", server, "error", err)
-_, err = runners.Refresh(build.EmbedFS)
+_, err = Refresh(build.EmbedFS)
 if err != nil {
 slog.Warn("failed to reinitialize payloads", "error", err)
 return nil, err

@@ -673,23 +640,6 @@ type completion struct {
 }
 }

-type CompletionRequest struct {
-Prompt string
-Format string
-Images []ImageData
-Options *api.Options
-}
-
-type CompletionResponse struct {
-Content string
-DoneReason string
-Done bool
-PromptEvalCount int
-PromptEvalDuration time.Duration
-EvalCount int
-EvalDuration time.Duration
-}
-
 func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error {
 if err := s.sem.Acquire(ctx, 1); err != nil {
 slog.Error("Failed to acquire semaphore", "error", err)
@@ -1,4 +1,4 @@
-package llm
+package runners

 import (
 "bytes"

@@ -1,4 +1,4 @@
-package llm
+package runners

 import (
 "syscall"

@@ -1,4 +1,4 @@
-package llm
+package runners

 import (
 "syscall"

@@ -1,4 +1,4 @@
-package llm
+package runners

 import (
 "syscall"
@@ -25,9 +25,9 @@ import (
 "github.com/ollama/ollama/api"
 "github.com/ollama/ollama/auth"
 "github.com/ollama/ollama/envconfig"
+"github.com/ollama/ollama/fileutils"
 "github.com/ollama/ollama/format"
 "github.com/ollama/ollama/llama"
-"github.com/ollama/ollama/llm"
 "github.com/ollama/ollama/parser"
 "github.com/ollama/ollama/template"
 "github.com/ollama/ollama/types/errtypes"

@@ -91,7 +91,7 @@ func (m *Model) CheckCapabilities(caps ...Capability) error {
 defer f.Close()

 // TODO(mxyng): decode the GGML into model to avoid doing this multiple times
-ggml, _, err := llm.DecodeGGML(f, 0)
+ggml, _, err := fileutils.DecodeGGML(f, 0)
 if err != nil {
 slog.Error("couldn't decode ggml", "error", err)
 continue

@@ -431,7 +431,7 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio
 baseLayer.MediaType == "application/vnd.ollama.image.model" &&
 baseLayer.GGML != nil &&
 baseLayer.GGML.Name() == "gguf" {
-want, err := llm.ParseFileType(quantization)
+want, err := fileutils.ParseFileType(quantization)
 if err != nil {
 return err
 }

@@ -467,7 +467,7 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio
 return err
 }

-ggml, _, err := llm.DecodeGGML(temp, 0)
+ggml, _, err := fileutils.DecodeGGML(temp, 0)
 if err != nil {
 return err
 }
@@ -18,7 +18,7 @@ import (

 "github.com/ollama/ollama/api"
 "github.com/ollama/ollama/convert"
-"github.com/ollama/ollama/llm"
+"github.com/ollama/ollama/fileutils"
 "github.com/ollama/ollama/template"
 "github.com/ollama/ollama/types/model"
 )

@@ -27,7 +27,7 @@ var intermediateBlobs map[string]string = make(map[string]string)

 type layerGGML struct {
 Layer
-*llm.GGML
+*fileutils.GGML
 }

 func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) {

@@ -67,7 +67,7 @@ func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressRe
 }
 defer blob.Close()

-ggml, _, err := llm.DecodeGGML(blob, 0)
+ggml, _, err := fileutils.DecodeGGML(blob, 0)
 if err != nil {
 return nil, err
 }

@@ -112,7 +112,7 @@ func parseFromZipFile(_ context.Context, command string, baseLayers []*layerGGML

 switch command {
 case "adapter":
-var baseModel *llm.GGML
+var baseModel *fileutils.GGML
 for _, l := range baseLayers {
 if l.GGML != nil {
 baseModel = l.GGML

@@ -150,7 +150,7 @@ func parseFromZipFile(_ context.Context, command string, baseLayers []*layerGGML
 }
 defer bin.Close()

-ggml, _, err := llm.DecodeGGML(bin, 0)
+ggml, _, err := fileutils.DecodeGGML(bin, 0)
 if err != nil {
 return nil, err
 }

@@ -184,7 +184,7 @@ func parseFromFile(ctx context.Context, command string, baseLayers []*layerGGML,

 var offset int64
 for offset < stat.Size() {
-ggml, n, err := llm.DecodeGGML(file, 0)
+ggml, n, err := fileutils.DecodeGGML(file, 0)
 if errors.Is(err, io.EOF) {
 break
 } else if err != nil {

@@ -263,7 +263,7 @@ func detectContentType(r io.Reader) (string, error) {
 return "", err
 }

-if contentType := llm.DetectGGMLType(b.Bytes()); contentType != "" {
+if contentType := fileutils.DetectGGMLType(b.Bytes()); contentType != "" {
 return contentType, nil
 }

@@ -13,7 +13,7 @@ import (
 "github.com/google/go-cmp/cmp"

 "github.com/ollama/ollama/api"
-"github.com/ollama/ollama/llm"
+"github.com/ollama/ollama/fileutils"
 "github.com/ollama/ollama/template"
 )

@@ -147,7 +147,7 @@ func TestParseFromFileFromLayer(t *testing.T) {
 t.Fatalf("failed to open file: %v", err)
 }
 defer file.Close()
-if err := llm.WriteGGUF(file, llm.KV{"general.architecture": "gemma"}, []llm.Tensor{}); err != nil {
+if err := fileutils.WriteGGUF(file, fileutils.KV{"general.architecture": "gemma"}, []fileutils.Tensor{}); err != nil {
 t.Fatalf("failed to write gguf: %v", err)
 }

@@ -200,7 +200,7 @@ func TestParseLayerFromCopy(t *testing.T) {
 defer file2.Close()

 for range 5 {
-if err := llm.WriteGGUF(file2, llm.KV{"general.architecture": "gemma"}, []llm.Tensor{}); err != nil {
+if err := fileutils.WriteGGUF(file2, fileutils.KV{"general.architecture": "gemma"}, []fileutils.Tensor{}); err != nil {
 t.Fatalf("failed to write gguf: %v", err)
 }
 }
@@ -10,7 +10,7 @@ import (
 "strings"

 "github.com/ollama/ollama/api"
-"github.com/ollama/ollama/llm"
+"github.com/ollama/ollama/runners"
 "github.com/ollama/ollama/server/imageproc"
 "github.com/ollama/ollama/template"
 )

@@ -22,7 +22,7 @@ var errTooManyImages = errors.New("vision model only supports a single image per
 // chatPrompt accepts a list of messages and returns the prompt and images that should be used for the next chat turn.
 // chatPrompt truncates any messages that exceed the context window of the model, making sure to always include 1) the
 // latest message and 2) system messages
-func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.Options, msgs []api.Message, tools []api.Tool) (prompt string, images []llm.ImageData, _ error) {
+func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.Options, msgs []api.Message, tools []api.Tool) (prompt string, images []runners.ImageData, _ error) {
 var system []api.Message

 isMllama := checkMllamaModelFamily(m)

@@ -90,7 +90,7 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
 return "", nil, err
 }

-imgData := llm.ImageData{
+imgData := runners.ImageData{
 Data: buf.Bytes(),
 AspectRatioID: aspectRatioID,
 }

@@ -105,7 +105,7 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
 prefix := ""
 prompt := msg.Content
 for _, i := range msg.Images {
-imgData := llm.ImageData{
+imgData := runners.ImageData{
 ID: len(images),
 Data: i,
 }
@@ -29,7 +29,7 @@ import (
 "github.com/ollama/ollama/build"
 "github.com/ollama/ollama/discover"
 "github.com/ollama/ollama/envconfig"
-"github.com/ollama/ollama/llm"
+"github.com/ollama/ollama/fileutils"
 "github.com/ollama/ollama/openai"
 "github.com/ollama/ollama/parser"
 "github.com/ollama/ollama/runners"

@@ -78,7 +78,7 @@ func modelOptions(model *Model, requestOpts map[string]interface{}) (api.Options

 // scheduleRunner schedules a runner after validating inputs such as capabilities and model options.
 // It returns the allocated runner, model instance, and consolidated options if successful and error otherwise.
-func (s *Server) scheduleRunner(ctx context.Context, name string, caps []Capability, requestOpts map[string]any, keepAlive *api.Duration) (llm.LlamaServer, *Model, *api.Options, error) {
+func (s *Server) scheduleRunner(ctx context.Context, name string, caps []Capability, requestOpts map[string]any, keepAlive *api.Duration) (runners.LLMServer, *Model, *api.Options, error) {
 if name == "" {
 return nil, nil, nil, fmt.Errorf("model %w", errRequired)
 }

@@ -187,9 +187,9 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 return
 }

-images := make([]llm.ImageData, len(req.Images))
+images := make([]runners.ImageData, len(req.Images))
 for i := range req.Images {
-images[i] = llm.ImageData{ID: i, Data: req.Images[i]}
+images[i] = runners.ImageData{ID: i, Data: req.Images[i]}
 }

 prompt := req.Prompt

@@ -255,12 +255,12 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 // TODO (jmorganca): avoid building the response twice both here and below
 var sb strings.Builder
 defer close(ch)
-if err := r.Completion(c.Request.Context(), llm.CompletionRequest{
+if err := r.Completion(c.Request.Context(), runners.CompletionRequest{
 Prompt: prompt,
 Images: images,
 Format: req.Format,
 Options: opts,
-}, func(cr llm.CompletionResponse) {
+}, func(cr runners.CompletionResponse) {
 res := api.GenerateResponse{
 Model: req.Model,
 CreatedAt: time.Now().UTC(),

@@ -639,7 +639,7 @@ func (s *Server) CreateHandler(c *gin.Context) {
 }

 if r.Path == "" && r.Modelfile == "" {
-c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "path or modelfile are required"})
+c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "path or fileutils are required"})
 return
 }

@@ -647,7 +647,7 @@ func (s *Server) CreateHandler(c *gin.Context) {
 if r.Path != "" && r.Modelfile == "" {
 f, err := os.Open(r.Path)
 if err != nil {
-c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("error reading modelfile: %s", err)})
+c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("error reading fileutils: %s", err)})
 return
 }
 defer f.Close()

@@ -851,12 +851,12 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
 return resp, nil
 }

-func getKVData(digest string, verbose bool) (llm.KV, error) {
+func getKVData(digest string, verbose bool) (fileutils.KV, error) {
 maxArraySize := 0
 if verbose {
 maxArraySize = -1
 }
-kvData, err := llm.LoadModel(digest, maxArraySize)
+kvData, err := fileutils.LoadModel(digest, maxArraySize)
 if err != nil {
 return nil, err
 }

@@ -1436,12 +1436,12 @@ func (s *Server) ChatHandler(c *gin.Context) {
 ch := make(chan any)
 go func() {
 defer close(ch)
-if err := r.Completion(c.Request.Context(), llm.CompletionRequest{
+if err := r.Completion(c.Request.Context(), runners.CompletionRequest{
 Prompt: prompt,
 Images: images,
 Format: req.Format,
 Options: opts,
-}, func(r llm.CompletionResponse) {
+}, func(r runners.CompletionResponse) {
 res := api.ChatResponse{
 Model: req.Model,
 CreatedAt: time.Now().UTC(),
@@ -16,12 +16,12 @@ import (
 "github.com/gin-gonic/gin"

 "github.com/ollama/ollama/api"
-"github.com/ollama/ollama/llm"
+"github.com/ollama/ollama/fileutils"
 )

 var stream bool = false

-func createBinFile(t *testing.T, kv map[string]any, ti []llm.Tensor) string {
+func createBinFile(t *testing.T, kv map[string]any, ti []fileutils.Tensor) string {
 t.Helper()

 f, err := os.CreateTemp(t.TempDir(), "")

@@ -30,7 +30,7 @@ func createBinFile(t *testing.T, kv map[string]any, ti []llm.Tensor) string {
 }
 defer f.Close()

-if err := llm.WriteGGUF(f, kv, ti); err != nil {
+if err := fileutils.WriteGGUF(f, kv, ti); err != nil {
 t.Fatal(err)
 }

@@ -581,7 +581,7 @@ func TestCreateDetectTemplate(t *testing.T) {
 t.Run("matched", func(t *testing.T) {
 w := createRequest(t, s.CreateHandler, api.CreateRequest{
 Name: "test",
-Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, llm.KV{
+Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, fileutils.KV{
 "tokenizer.chat_template": "{{ bos_token }}{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}",
 }, nil)),
 Stream: &stream,
@ -16,18 +16,19 @@ import (
|
|||||||
|
|
||||||
"github.com/ollama/ollama/api"
|
"github.com/ollama/ollama/api"
|
||||||
"github.com/ollama/ollama/discover"
|
"github.com/ollama/ollama/discover"
|
||||||
"github.com/ollama/ollama/llm"
|
"github.com/ollama/ollama/fileutils"
|
||||||
|
"github.com/ollama/ollama/runners"
|
||||||
)
|
)
|
||||||
|
|
||||||
type mockRunner struct {
|
type mockRunner struct {
|
||||||
llm.LlamaServer
|
runners.LLMServer
|
||||||
|
|
||||||
// CompletionRequest is only valid until the next call to Completion
|
// CompletionRequest is only valid until the next call to Completion
|
||||||
llm.CompletionRequest
|
runners.CompletionRequest
|
||||||
llm.CompletionResponse
|
runners.CompletionResponse
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *mockRunner) Completion(_ context.Context, r llm.CompletionRequest, fn func(r llm.CompletionResponse)) error {
|
func (m *mockRunner) Completion(_ context.Context, r runners.CompletionRequest, fn func(r runners.CompletionResponse)) error {
|
||||||
m.CompletionRequest = r
|
m.CompletionRequest = r
|
||||||
fn(m.CompletionResponse)
|
fn(m.CompletionResponse)
|
||||||
return nil
|
return nil
|
||||||
@ -41,8 +42,8 @@ func (mockRunner) Tokenize(_ context.Context, s string) (tokens []int, err error
 	return
 }

-func newMockServer(mock *mockRunner) func(discover.GpuInfoList, string, *llm.GGML, []string, []string, api.Options, int) (llm.LlamaServer, error) {
-	return func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, projectors, system []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+func newMockServer(mock *mockRunner) func(discover.GpuInfoList, string, *fileutils.GGML, []string, []string, api.Options, int) (runners.LLMServer, error) {
+	return func(gpus discover.GpuInfoList, model string, ggml *fileutils.GGML, projectors, system []string, opts api.Options, numParallel int) (runners.LLMServer, error) {
 		return mock, nil
 	}
 }
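As a sketch of how such a factory gets consumed, here is one assumed wiring that stands the mock in for runners.NewLlamaServer. InitScheduler, newServerFn, getGpuFn, getCpuFn, mockRunner and runners.CompletionResponse all appear elsewhere in this diff; the helper name and the exact sequence are illustrative and presume these files share the server package.

// Illustrative sketch: a scheduler whose "runner" is the in-process mock.
func newTestScheduler(t *testing.T) *Scheduler {
	t.Helper()
	mock := mockRunner{
		CompletionResponse: runners.CompletionResponse{
			Done:       true,
			DoneReason: "stop",
		},
	}
	s := InitScheduler(context.TODO())
	s.newServerFn = newMockServer(&mock) // loads the mock instead of spawning a llama server
	s.getGpuFn = discover.GetGPUInfo
	s.getCpuFn = discover.GetCPUInfo
	return s
}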
@ -51,7 +52,7 @@ func TestGenerateChat(t *testing.T) {
 	gin.SetMode(gin.TestMode)

 	mock := mockRunner{
-		CompletionResponse: llm.CompletionResponse{
+		CompletionResponse: runners.CompletionResponse{
 			Done: true,
 			DoneReason: "stop",
 			PromptEvalCount: 1,
@ -72,7 +73,7 @@ func TestGenerateChat(t *testing.T) {
 			getGpuFn: discover.GetGPUInfo,
 			getCpuFn: discover.GetCPUInfo,
 			reschedDelay: 250 * time.Millisecond,
-			loadFn: func(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int) {
+			loadFn: func(req *LlmRequest, ggml *fileutils.GGML, gpus discover.GpuInfoList, numParallel int) {
 				// add small delay to simulate loading
 				time.Sleep(time.Millisecond)
 				req.successCh <- &runnerRef{
@ -91,7 +92,7 @@ func TestGenerateChat(t *testing.T) {
 {{- if .System }}System: {{ .System }} {{ end }}
 {{- if .Prompt }}User: {{ .Prompt }} {{ end }}
 {{- if .Response }}Assistant: {{ .Response }} {{ end }}"""
-`, createBinFile(t, llm.KV{
+`, createBinFile(t, fileutils.KV{
 			"general.architecture": "llama",
 			"llama.block_count": uint32(1),
 			"llama.context_length": uint32(8192),
@ -101,7 +102,7 @@ func TestGenerateChat(t *testing.T) {
 			"tokenizer.ggml.tokens": []string{""},
 			"tokenizer.ggml.scores": []float32{0},
 			"tokenizer.ggml.token_type": []int32{0},
-		}, []llm.Tensor{
+		}, []fileutils.Tensor{
 			{Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
 			{Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
 			{Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
@ -146,10 +147,10 @@ func TestGenerateChat(t *testing.T) {
 	t.Run("missing capabilities chat", func(t *testing.T) {
 		w := createRequest(t, s.CreateHandler, api.CreateRequest{
 			Model: "bert",
-			Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, llm.KV{
+			Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, fileutils.KV{
 				"general.architecture": "bert",
 				"bert.pooling_type": uint32(0),
-			}, []llm.Tensor{})),
+			}, []fileutils.Tensor{})),
 			Stream: &stream,
 		})

@ -349,7 +350,7 @@ func TestGenerate(t *testing.T) {
 	gin.SetMode(gin.TestMode)

 	mock := mockRunner{
-		CompletionResponse: llm.CompletionResponse{
+		CompletionResponse: runners.CompletionResponse{
 			Done: true,
 			DoneReason: "stop",
 			PromptEvalCount: 1,
@ -370,7 +371,7 @@ func TestGenerate(t *testing.T) {
 			getGpuFn: discover.GetGPUInfo,
 			getCpuFn: discover.GetCPUInfo,
 			reschedDelay: 250 * time.Millisecond,
-			loadFn: func(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int) {
+			loadFn: func(req *LlmRequest, ggml *fileutils.GGML, gpus discover.GpuInfoList, numParallel int) {
 				// add small delay to simulate loading
 				time.Sleep(time.Millisecond)
 				req.successCh <- &runnerRef{
@ -389,7 +390,7 @@ func TestGenerate(t *testing.T) {
 {{- if .System }}System: {{ .System }} {{ end }}
 {{- if .Prompt }}User: {{ .Prompt }} {{ end }}
 {{- if .Response }}Assistant: {{ .Response }} {{ end }}"""
-`, createBinFile(t, llm.KV{
+`, createBinFile(t, fileutils.KV{
 			"general.architecture": "llama",
 			"llama.block_count": uint32(1),
 			"llama.context_length": uint32(8192),
@ -399,7 +400,7 @@ func TestGenerate(t *testing.T) {
 			"tokenizer.ggml.tokens": []string{""},
 			"tokenizer.ggml.scores": []float32{0},
 			"tokenizer.ggml.token_type": []int32{0},
-		}, []llm.Tensor{
+		}, []fileutils.Tensor{
 			{Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
 			{Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
 			{Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
@ -444,10 +445,10 @@ func TestGenerate(t *testing.T) {
 	t.Run("missing capabilities generate", func(t *testing.T) {
 		w := createRequest(t, s.CreateHandler, api.CreateRequest{
 			Model: "bert",
-			Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, llm.KV{
+			Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, fileutils.KV{
 				"general.architecture": "bert",
 				"bert.pooling_type": uint32(0),
-			}, []llm.Tensor{})),
+			}, []fileutils.Tensor{})),
 			Stream: &stream,
 		})

@ -16,7 +16,7 @@ import (
 	"testing"

 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fileutils"
 	"github.com/ollama/ollama/openai"
 	"github.com/ollama/ollama/parser"
 	"github.com/ollama/ollama/types/model"
@ -83,14 +83,14 @@ func Test_Routes(t *testing.T) {
 		fname := createTestFile(t, "ollama-model")

 		r := strings.NewReader(fmt.Sprintf("FROM %s\nPARAMETER seed 42\nPARAMETER top_p 0.9\nPARAMETER stop foo\nPARAMETER stop bar", fname))
-		modelfile, err := parser.ParseFile(r)
+		fileutils, err := parser.ParseFile(r)
 		if err != nil {
 			t.Fatalf("failed to parse file: %v", err)
 		}
 		fn := func(resp api.ProgressResponse) {
 			t.Logf("Status: %s", resp.Status)
 		}
-		err = CreateModel(context.TODO(), model.ParseName(name), "", "", modelfile, fn)
+		err = CreateModel(context.TODO(), model.ParseName(name), "", "", fileutils, fn)
 		if err != nil {
 			t.Fatalf("failed to create model: %v", err)
 		}
@ -561,8 +561,8 @@ func TestShow(t *testing.T) {
 		Name: "show-model",
 		Modelfile: fmt.Sprintf(
 			"FROM %s\nFROM %s",
-			createBinFile(t, llm.KV{"general.architecture": "test"}, nil),
-			createBinFile(t, llm.KV{"general.type": "projector", "general.architecture": "clip"}, nil),
+			createBinFile(t, fileutils.KV{"general.architecture": "test"}, nil),
+			createBinFile(t, fileutils.KV{"general.type": "projector", "general.architecture": "clip"}, nil),
 		),
 	})

@ -17,8 +17,9 @@ import (
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/envconfig"
+	"github.com/ollama/ollama/fileutils"
 	"github.com/ollama/ollama/format"
-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/runners"
 )

 type LlmRequest struct {
@ -41,8 +42,8 @@ type Scheduler struct {
 	loaded map[string]*runnerRef
 	loadedMu sync.Mutex

-	loadFn func(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int)
-	newServerFn func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
+	loadFn func(req *LlmRequest, ggml *fileutils.GGML, gpus discover.GpuInfoList, numParallel int)
+	newServerFn func(gpus discover.GpuInfoList, model string, ggml *fileutils.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (runners.LLMServer, error)
 	getGpuFn func() discover.GpuInfoList
 	getCpuFn func() discover.GpuInfoList
 	reschedDelay time.Duration
@ -68,7 +69,7 @@ func InitScheduler(ctx context.Context) *Scheduler {
 		expiredCh: make(chan *runnerRef, maxQueue),
 		unloadedCh: make(chan interface{}, maxQueue),
 		loaded: make(map[string]*runnerRef),
-		newServerFn: llm.NewLlamaServer,
+		newServerFn: runners.NewLlamaServer,
 		getGpuFn: discover.GetGPUInfo,
 		getCpuFn: discover.GetCPUInfo,
 		reschedDelay: 250 * time.Millisecond,
@ -187,7 +188,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
 				}

 				// Load model for fitting
-				ggml, err := llm.LoadModel(pending.model.ModelPath, 0)
+				ggml, err := fileutils.LoadModel(pending.model.ModelPath, 0)
 				if err != nil {
 					pending.errCh <- err
 					break
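The call above is where the scheduler first reads the GGUF metadata it later sizes against the GPUs. Below is a standalone sketch of the same pattern; the signature mirrors the call above, while the helper name, the error wrapping, and the reading of the second argument as a decode limit are assumptions.

// Illustrative sketch: parse GGUF metadata before deciding where to place the model.
func describeModel(path string) (*fileutils.GGML, error) {
	ggml, err := fileutils.LoadModel(path, 0) // 0 mirrors the argument used above
	if err != nil {
		return nil, fmt.Errorf("unable to read model metadata: %w", err)
	}
	return ggml, nil
}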
@ -409,7 +410,7 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm
 	}()
 }

-func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int) {
+func (s *Scheduler) load(req *LlmRequest, ggml *fileutils.GGML, gpus discover.GpuInfoList, numParallel int) {
 	if numParallel < 1 {
 		numParallel = 1
 	}
@ -422,7 +423,7 @@ func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoL
 		// some older models are not compatible with newer versions of llama.cpp
 		// show a generalized compatibility error until there is a better way to
 		// check for model compatibility
-		if errors.Is(err, llm.ErrUnsupportedFormat) || strings.Contains(err.Error(), "failed to load model") {
+		if errors.Is(err, fileutils.ErrUnsupportedFormat) || strings.Contains(err.Error(), "failed to load model") {
 			err = fmt.Errorf("%v: this model may be incompatible with your version of Ollama. If you previously pulled this model, try updating it by running `ollama pull %s`", err, req.model.ShortName)
 		}
 		slog.Info("NewLlamaServer failed", "model", req.model.ModelPath, "error", err)
@ -540,7 +541,7 @@ type runnerRef struct {
 	refCount uint // prevent unloading if > 0
 	// unloading bool // set to true when we are trying to unload the runner

-	llama llm.LlamaServer
+	llama runners.LLMServer
 	loading bool // True only during initial load, then false forever
 	gpus discover.GpuInfoList // Recorded at time of provisioning
 	estimatedVRAM uint64
@ -685,7 +686,7 @@ func (a ByDuration) Less(i, j int) bool {
 // If the model can not be fit fully within the available GPU(s) nil is returned
 // If numParallel is <= 0, this will attempt try to optimize parallism based on available VRAM, and adjust
 // opts.NumCtx accordingly
-func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
+func pickBestFullFitByLibrary(req *LlmRequest, ggml *fileutils.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
 	var estimatedVRAM uint64

 	var numParallelToTry []int
@ -710,7 +711,7 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.Gpu
 			req.opts.NumCtx = req.origNumCtx * p
 			if !envconfig.SchedSpread() {
 				for _, g := range sgl {
-					if ok, estimatedVRAM = llm.PredictServerFit([]discover.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
+					if ok, estimatedVRAM = fileutils.PredictServerFit([]discover.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
 						slog.Info("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "parallel", p, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
 						*numParallel = p
 						return []discover.GpuInfo{g}
@ -726,7 +727,7 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.Gpu
 		// Now try all the GPUs
 		for _, p := range numParallelToTry {
 			req.opts.NumCtx = req.origNumCtx * p
-			if ok, estimatedVRAM = llm.PredictServerFit(sgl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
+			if ok, estimatedVRAM = fileutils.PredictServerFit(sgl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
 				slog.Info("new model will fit in available VRAM, loading", "model", req.model.ModelPath, "library", sgl[0].Library, "parallel", p, "required", format.HumanBytes2(estimatedVRAM))
 				*numParallel = p
 				return sgl
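A compact sketch of the fit check both hunks above lean on. The PredictServerFit signature, the GpuInfo fields and format.HumanBytes2 are taken from this diff; the helper name and the logging are illustrative.

// Illustrative sketch: does the model fit entirely in one GPU's free VRAM?
func fitsOnSingleGPU(g discover.GpuInfo, ggml *fileutils.GGML, adapters, projectors []string, opts api.Options) bool {
	ok, estimatedVRAM := fileutils.PredictServerFit([]discover.GpuInfo{g}, ggml, adapters, projectors, opts)
	if ok {
		slog.Debug("model fits on a single GPU", "gpu", g.ID, "required", format.HumanBytes2(estimatedVRAM))
	}
	return ok
}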
@ -737,7 +738,7 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.Gpu
 }

 // If multiple Libraries are detected, pick the Library which loads the most layers for the model
-func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
+func pickBestPartialFitByLibrary(req *LlmRequest, ggml *fileutils.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
 	if *numParallel <= 0 {
 		*numParallel = 1
 		req.opts.NumCtx = req.origNumCtx
@ -749,7 +750,7 @@ func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.
 	var bestEstimate uint64
 	var bestFit int
 	for i, gl := range byLibrary {
-		_, estimatedVRAM := llm.PredictServerFit(gl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts)
+		_, estimatedVRAM := fileutils.PredictServerFit(gl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts)
 		if estimatedVRAM > bestEstimate {
 			bestEstimate = estimatedVRAM
 			bestFit = i
@ -822,9 +823,9 @@ func (s *Scheduler) expireRunner(model *Model) {

 // If other runners are loaded, make sure the pending request will fit in system memory
 // If not, pick a runner to unload, else return nil and the request can be loaded
-func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList) *runnerRef {
+func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, ggml *fileutils.GGML, gpus discover.GpuInfoList) *runnerRef {
 	slog.Debug("evaluating if CPU model load will fit in available system memory")
-	estimate := llm.EstimateGPULayers(gpus, ggml, req.model.ProjectorPaths, req.opts)
+	estimate := fileutils.EstimateGPULayers(gpus, ggml, req.model.ProjectorPaths, req.opts)
 	if estimate.TotalSize <= gpus[0].FreeMemory {
 		slog.Debug("cpu inference mode, model fits in available system memory", "model", format.HumanBytes2(estimate.TotalSize), "available", format.HumanBytes2(gpus[0].FreeMemory))
 		return nil
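The same estimator drives the CPU-memory check above; here is a short sketch of the call shape. EstimateGPULayers, estimate.TotalSize and FreeMemory come from the hunk above, while the helper name and the bare threshold comparison are assumptions.

// Illustrative sketch: estimate the full load size and compare it to free system memory.
func fitsInSystemMemory(gpus discover.GpuInfoList, ggml *fileutils.GGML, projectors []string, opts api.Options) bool {
	estimate := fileutils.EstimateGPULayers(gpus, ggml, projectors, opts)
	return estimate.TotalSize <= gpus[0].FreeMemory
}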
@ -14,8 +14,9 @@ import (
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/app/lifecycle"
 	"github.com/ollama/ollama/discover"
+	"github.com/ollama/ollama/fileutils"
 	"github.com/ollama/ollama/format"
-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/runners"
 )

 func TestMain(m *testing.M) {
@ -37,7 +38,7 @@ func TestLoad(t *testing.T) {
 	ctx, done := context.WithTimeout(context.Background(), 20*time.Millisecond)
 	defer done()
 	s := InitScheduler(ctx)
-	var ggml *llm.GGML // value not used in tests
+	var ggml *fileutils.GGML // value not used in tests
 	req := &LlmRequest{
 		ctx: ctx,
 		model: &Model{ModelPath: "foo"},
@ -47,7 +48,7 @@ func TestLoad(t *testing.T) {
 		sessionDuration: &api.Duration{Duration: 2 * time.Second},
 	}
 	// Fail to load model first
-	s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+	s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *fileutils.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (runners.LLMServer, error) {
 		return nil, errors.New("something failed to load model blah")
 	}
 	gpus := discover.GpuInfoList{}
@ -61,7 +62,7 @@ func TestLoad(t *testing.T) {
 	require.Contains(t, err.Error(), "this model may be incompatible")

 	server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
-	s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+	s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *fileutils.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (runners.LLMServer, error) {
 		return server, nil
 	}
 	s.load(req, ggml, gpus, 0)
@ -99,10 +100,10 @@ type reqBundle struct {
 	ctxDone func()
 	srv *mockLlm
 	req *LlmRequest
-	ggml *llm.GGML
+	ggml *fileutils.GGML
 }

-func (scenario *reqBundle) newServer(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+func (scenario *reqBundle) newServer(gpus discover.GpuInfoList, model string, ggml *fileutils.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (runners.LLMServer, error) {
 	return scenario.srv, nil
 }

@ -115,7 +116,7 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est
 	require.NoError(t, err)
 	defer f.Close()

-	require.NoError(t, llm.WriteGGUF(f, llm.KV{
+	require.NoError(t, fileutils.WriteGGUF(f, fileutils.KV{
 		"general.architecture": "llama",
 		"llama.context_length": uint32(32),
 		"llama.embedding_length": uint32(4096),
@ -125,7 +126,7 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est
 		"tokenizer.ggml.tokens": []string{" "},
 		"tokenizer.ggml.scores": []float32{0},
 		"tokenizer.ggml.token_type": []int32{0},
-	}, []llm.Tensor{
+	}, []fileutils.Tensor{
 		{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
 		{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
 	}))
@ -133,7 +134,7 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est

 	fname := f.Name()
 	model := &Model{Name: modelName, ModelPath: fname}
-	b.ggml, err = llm.LoadModel(model.ModelPath, 0)
+	b.ggml, err = fileutils.LoadModel(model.ModelPath, 0)
 	require.NoError(t, err)

 	if duration == nil {
@ -419,10 +420,10 @@ func TestExpireRunner(t *testing.T) {
 		sessionDuration: &api.Duration{Duration: 2 * time.Minute},
 	}

-	var ggml *llm.GGML
+	var ggml *fileutils.GGML
 	gpus := discover.GpuInfoList{}
 	server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
-	s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+	s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *fileutils.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (runners.LLMServer, error) {
 		return server, nil
 	}
 	s.load(req, ggml, gpus, 0)
@ -729,7 +730,7 @@ func TestHomogeneousGPUs(t *testing.T) {
 	}
 	s.getCpuFn = getCpuFn
 	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
-	s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+	s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *fileutils.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (runners.LLMServer, error) {
 		require.Len(t, gpus, 1)
 		return a.newServer(gpus, model, ggml, adapters, projectors, opts, numParallel)
 	}
@ -768,7 +769,7 @@ type mockLlm struct {

 func (s *mockLlm) Ping(ctx context.Context) error { return s.pingResp }
 func (s *mockLlm) WaitUntilRunning(ctx context.Context) error { return s.waitResp }
-func (s *mockLlm) Completion(ctx context.Context, req llm.CompletionRequest, fn func(llm.CompletionResponse)) error {
+func (s *mockLlm) Completion(ctx context.Context, req runners.CompletionRequest, fn func(runners.CompletionResponse)) error {
 	return s.completionResp
 }

@ -14,7 +14,7 @@ import (
 	"github.com/google/go-cmp/cmp"

 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fileutils"
 )

 func TestNamed(t *testing.T) {
@ -33,7 +33,7 @@ func TestNamed(t *testing.T) {

 	for k, v := range ss {
 		t.Run(k, func(t *testing.T) {
-			kv := llm.KV{"tokenizer.chat_template": v}
+			kv := fileutils.KV{"tokenizer.chat_template": v}
 			s := kv.ChatTemplate()
 			r, err := Named(s)
 			if err != nil {
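To make the lookup in this last hunk concrete: kv.ChatTemplate and Named are used exactly as above, while the helper below and its boolean contract are illustrative and presume it sits in the same template package.

// Illustrative sketch: report whether an embedded chat template maps to a named Ollama template.
func hasNamedTemplate(tmpl string) bool {
	kv := fileutils.KV{"tokenizer.chat_template": tmpl}
	_, err := Named(kv.ChatTemplate())
	return err == nil
}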