Compare commits

..

2 Commits

Author          SHA1        Message                                                                   Date
(unknown)       1401b24c79  Remove mem check                                                          2024-11-14 13:26:13 +01:00
Blake Mizerany  67691e410d  cmd: preserve exact bytes when displaying template/system layers (#7586)  2024-11-13 23:53:30 -08:00
3 changed files with 22 additions and 60 deletions

cmd/cmd.go

@@ -800,9 +800,9 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
 	case "parameters":
 		fmt.Println(resp.Parameters)
 	case "system":
-		fmt.Println(resp.System)
+		fmt.Print(resp.System)
 	case "template":
-		fmt.Println(resp.Template)
+		fmt.Print(resp.Template)
 	}
 
 	return nil

llama/runner/runner.go

@@ -20,8 +20,6 @@ import (
 	"time"
 	"unicode/utf8"
 
-	"golang.org/x/sync/semaphore"
-
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/llama"
 )
@@ -205,51 +203,38 @@ func (s *Server) inputs(prompt string, images []ImageData) ([]input, error) {
 }
 
 type Server struct {
-	// is the server ready to process requests?
-	// protects access to model and image
-	ready sync.WaitGroup
-
-	// loaded model
 	model *llama.Model
+	lc    *llama.Context
 
-	// image model context for multi-modal models
+	// required for image embeddings
 	image *ImageContext
 
-	// status for external health reporting - loading, ready to serve, etc.
-	status ServerStatus
-
-	// current progress on loading the model
-	progress float32
-
-	// number of simultaneous requests to handle
-	parallel int
-
-	// maximum number of elements in a batch (per sequence)
 	// TODO (jmorganca): make this n_batch
 	batchSize int
 
-	// protects access to everything below this line
-	// this is context state needed for decoding
-	mu sync.Mutex
+	// parallel is the number of parallel requests to handle
+	parallel int
 
-	// indicates that data is ready for processing
-	cond *sync.Cond
-
-	// decoding state
-	lc *llama.Context
-
-	// the list of simultaneous sequences being evaluated
+	// seqs is the list of parallel sequences being evaluated
+	// TODO (jmorganca): this can probably be moved into run()
 	seqs []*Sequence
 
-	// seqs can have a maximum of parallel entries, which
-	// is enfoced by seqSem
-	seqsSem *semaphore.Weighted
-
 	// KV cache
 	cache *InputCache
 
 	// next sequence for prompt processing to avoid starvation
 	nextSeq int
+
+	// is the server ready to process requests?
+	ready sync.WaitGroup
+
+	mu sync.Mutex
+
+	cond *sync.Cond
+
+	progress float32
+
+	status ServerStatus
 }
 
 func (s *Server) allNil() bool {
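
Both sides of this hunk pair a sync.Mutex with a sync.Cond to signal that new sequence data is ready for the decode loop. A minimal, self-contained sketch of that pattern follows; the names are illustrative, not the runner's actual code:

```go
package main

import (
	"fmt"
	"sync"
)

func main() {
	var mu sync.Mutex
	cond := sync.NewCond(&mu)
	queue := []string{}

	// Consumer: sleeps on the condition variable until work arrives,
	// the same shape as a decode loop waiting on s.cond.
	done := make(chan struct{})
	go func() {
		defer close(done)
		mu.Lock()
		defer mu.Unlock()
		for len(queue) == 0 {
			cond.Wait() // releases mu while asleep, reacquires on wake
		}
		fmt.Println("processing", queue[0])
	}()

	// Producer: publishes a sequence under the lock, then signals.
	mu.Lock()
	queue = append(queue, "seq-0")
	mu.Unlock()
	cond.Signal()

	<-done
}
```
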
@@ -624,13 +609,8 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
 		return
 	}
 
-	// Ensure that a place to put the sequence is available
-	if err := s.seqsSem.Acquire(r.Context(), 1); err != nil {
-		slog.Error("Failed to acquire semaphore", "error", err)
-		return
-	}
-	defer s.seqsSem.Release(1)
-
+	// TODO (jmorganca): add to sequence queue instead of
+	// failing if a slot isn't available
 	s.mu.Lock()
 	for i, sq := range s.seqs {
 		if sq == nil {
@@ -713,13 +693,7 @@ func (s *Server) embeddings(w http.ResponseWriter, r *http.Request) {
 		return
 	}
 
-	// Ensure that a place to put the sequence is available
-	if err := s.seqsSem.Acquire(r.Context(), 1); err != nil {
-		slog.Error("Failed to acquire semaphore", "error", err)
-		return
-	}
-	defer s.seqsSem.Release(1)
-
+	// TODO (jessegross): Wait for a free slot instead of failing and blocking forever
 	s.mu.Lock()
 	for i, sq := range s.seqs {
 		if sq == nil {
@@ -874,7 +848,6 @@ func main() {
 		batchSize: *batchSize,
 		parallel:  *parallel,
 		seqs:      make([]*Sequence, *parallel),
-		seqsSem:   semaphore.NewWeighted(int64(*parallel)),
 		status:    ServerStatusLoadingModel,
 	}
 
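The seqsSem lines removed across these hunks implemented admission control with golang.org/x/sync/semaphore: a weighted semaphore sized to parallel makes a handler block until a sequence slot frees up, or return early if the client disconnects, rather than fail outright. A rough sketch of the same pattern, with illustrative names:

```go
package main

import (
	"context"
	"fmt"
	"sync"
	"time"

	"golang.org/x/sync/semaphore"
)

func main() {
	const parallel = 2
	sem := semaphore.NewWeighted(parallel)

	handle := func(ctx context.Context, id int) error {
		// Mirrors s.seqsSem.Acquire(r.Context(), 1): blocks until a slot
		// is free, or returns an error if ctx is canceled first.
		if err := sem.Acquire(ctx, 1); err != nil {
			return err
		}
		defer sem.Release(1)

		time.Sleep(10 * time.Millisecond) // stand-in for running a sequence
		fmt.Printf("request %d ran in a slot\n", id)
		return nil
	}

	var wg sync.WaitGroup
	for i := 0; i < 5; i++ {
		wg.Add(1)
		go func(id int) {
			defer wg.Done()
			_ = handle(context.Background(), id)
		}(i)
	}
	wg.Wait()
}
```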

llm/server.go

@@ -128,17 +128,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 		}
 	}
 
-	// On linux and windows, over-allocating CPU memory will almost always result in an error
-	// Darwin has fully dynamic swap so has no direct concept of free swap space
-	if runtime.GOOS != "darwin" {
-		systemMemoryRequired := estimate.TotalSize - estimate.VRAMSize
-		available := systemFreeMemory + systemSwapFreeMemory
-		if systemMemoryRequired > available {
-			slog.Warn("model request too large for system", "requested", format.HumanBytes2(systemMemoryRequired), "available", available, "total", format.HumanBytes2(systemTotalMemory), "free", format.HumanBytes2(systemFreeMemory), "swap", format.HumanBytes2(systemSwapFreeMemory))
-			return nil, fmt.Errorf("model requires more system memory (%s) than is available (%s)", format.HumanBytes2(systemMemoryRequired), format.HumanBytes2(available))
-		}
-	}
-
 	estimate.log()
 
 	// Loop through potential servers
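
For reference, the guard deleted by the "Remove mem check" commit amounts to the following check. The function below is a hypothetical reconstruction for illustration, not code from the repository:

```go
package main

import (
	"fmt"
	"runtime"
)

// fitsInSystemMemory reproduces the shape of the removed check: outside of
// Darwin, the CPU-resident share of the load estimate must fit in free RAM
// plus free swap. All inputs are in bytes.
func fitsInSystemMemory(totalSize, vramSize, freeRAM, freeSwap uint64) error {
	if runtime.GOOS == "darwin" {
		// Darwin's swap is fully dynamic, so there is no hard limit to test.
		return nil
	}
	required := totalSize - vramSize // portion of the model kept in CPU memory
	if available := freeRAM + freeSwap; required > available {
		return fmt.Errorf("model requires more system memory (%d) than is available (%d)", required, available)
	}
	return nil
}

func main() {
	// 8 GiB model, 6 GiB offloaded to VRAM, 1 GiB free RAM, 512 MiB free swap.
	fmt.Println(fitsInSystemMemory(8<<30, 6<<30, 1<<30, 512<<20))
}
```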