Compare commits


2 Commits

Author           SHA1         Message                                                                     Date
                 1401b24c79   Remove mem check                                                            2024-11-14 13:26:13 +01:00
Blake Mizerany   67691e410d   cmd: preserve exact bytes when displaying template/system layers (#7586)   2024-11-13 23:53:30 -08:00
3 changed files with 22 additions and 60 deletions


@@ -800,9 +800,9 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
 	case "parameters":
 		fmt.Println(resp.Parameters)
 	case "system":
-		fmt.Println(resp.System)
+		fmt.Print(resp.System)
 	case "template":
-		fmt.Println(resp.Template)
+		fmt.Print(resp.Template)
 	}
 
 	return nil
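Note: fmt.Println appends a trailing newline, so ollama show was emitting the system and template layers with an extra byte that changed their round-tripped contents; fmt.Print writes the stored value exactly. A minimal standalone sketch of the difference (the template literal is an illustrative value, not one from the diff):

package main

import "fmt"

func main() {
	tmpl := "{{ .Prompt }}\n" // stored value already ends in a newline

	fmt.Println(tmpl) // writes 15 bytes: the value plus an added '\n'
	fmt.Print(tmpl)   // writes 14 bytes: the exact stored contents
}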


@@ -20,8 +20,6 @@ import (
 	"time"
 	"unicode/utf8"
 
-	"golang.org/x/sync/semaphore"
-
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/llama"
 )
@@ -205,51 +203,38 @@ func (s *Server) inputs(prompt string, images []ImageData) ([]input, error) {
 }
 
 type Server struct {
-	// is the server ready to process requests?
-	// protects access to model and image
-	ready sync.WaitGroup
-
-	// loaded model
 	model *llama.Model
+	lc    *llama.Context
 
-	// image model context for multi-modal models
+	// required for image embeddings
 	image *ImageContext
 
-	// status for external health reporting - loading, ready to serve, etc.
-	status ServerStatus
-
-	// current progress on loading the model
-	progress float32
-
-	// number of simultaneous requests to handle
-	parallel int
-
-	// maximum number of elements in a batch (per sequence)
 	// TODO (jmorganca): make this n_batch
 	batchSize int
 
-	// protects access to everything below this line
-	// this is context state needed for decoding
-	mu sync.Mutex
-
-	// indicates that data is ready for processing
-	cond *sync.Cond
-
-	// decoding state
-	lc *llama.Context
+	// parallel is the number of parallel requests to handle
+	parallel int
 
-	// the list of simultaneous sequences being evaluated
+	// seqs is the list of parallel sequences being evaluated
+	// TODO (jmorganca): this can probably be moved into run()
 	seqs []*Sequence
 
-	// seqs can have a maximum of parallel entries, which
-	// is enfoced by seqSem
-	seqsSem *semaphore.Weighted
-
 	// KV cache
 	cache *InputCache
 
 	// next sequence for prompt processing to avoid starvation
 	nextSeq int
+
+	// is the server ready to process requests?
+	ready sync.WaitGroup
+
+	mu sync.Mutex
+
+	cond *sync.Cond
+
+	progress float32
+
+	status ServerStatus
 }
 
 func (s *Server) allNil() bool {
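For context on the struct change: seqsSem bounded the number of in-flight sequences to parallel using golang.org/x/sync/semaphore, and this compare removes that in favor of managing the seqs slots under mu alone. A hedged, self-contained sketch of the weighted-semaphore pattern the removed field implemented (the request and slot names are illustrative, not the runner's):

package main

import (
	"context"
	"fmt"
	"sync"
	"time"

	"golang.org/x/sync/semaphore"
)

func main() {
	const parallel = 2
	sem := semaphore.NewWeighted(parallel) // at most `parallel` concurrent holders

	var wg sync.WaitGroup
	for i := 0; i < 5; i++ {
		wg.Add(1)
		go func(id int) {
			defer wg.Done()
			// Acquire blocks until a slot frees up, or fails if the
			// context is cancelled (the runner passed r.Context() here).
			if err := sem.Acquire(context.Background(), 1); err != nil {
				return
			}
			defer sem.Release(1)
			fmt.Println("request", id, "holds a sequence slot")
			time.Sleep(10 * time.Millisecond) // stand-in for decoding work
		}(i)
	}
	wg.Wait()
}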
@@ -624,13 +609,8 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
 		return
 	}
 
-	// Ensure that a place to put the sequence is available
-	if err := s.seqsSem.Acquire(r.Context(), 1); err != nil {
-		slog.Error("Failed to acquire semaphore", "error", err)
-		return
-	}
-	defer s.seqsSem.Release(1)
-
+	// TODO (jmorganca): add to sequence queue instead of
+	// failing if a slot isn't available
 	s.mu.Lock()
 	for i, sq := range s.seqs {
 		if sq == nil {
@@ -713,13 +693,7 @@ func (s *Server) embeddings(w http.ResponseWriter, r *http.Request) {
 		return
 	}
 
-	// Ensure that a place to put the sequence is available
-	if err := s.seqsSem.Acquire(r.Context(), 1); err != nil {
-		slog.Error("Failed to acquire semaphore", "error", err)
-		return
-	}
-	defer s.seqsSem.Release(1)
-
+	// TODO (jessegross): Wait for a free slot instead of failing and blocking forever
 	s.mu.Lock()
 	for i, sq := range s.seqs {
 		if sq == nil {
@@ -874,7 +848,6 @@ func main() {
 		batchSize: *batchSize,
 		parallel:  *parallel,
 		seqs:      make([]*Sequence, *parallel),
-		seqsSem:   semaphore.NewWeighted(int64(*parallel)),
 		status:    ServerStatusLoadingModel,
 	}
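Without the semaphore, both handlers take s.mu and scan s.seqs for a nil entry, failing immediately when every slot is busy, which is exactly what the restored TODO comments call out. A rough, self-contained sketch of that slot-scan pattern (Server and Sequence here are cut-down stand-ins, not the runner's real types):

package main

import (
	"errors"
	"fmt"
	"sync"
)

type Sequence struct{ prompt string }

type Server struct {
	mu   sync.Mutex
	seqs []*Sequence
}

// claim stores seq in the first free slot, or fails immediately when all
// slots are taken (the behavior the TODOs want replaced with queuing).
func (s *Server) claim(seq *Sequence) (int, error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	for i, sq := range s.seqs {
		if sq == nil {
			s.seqs[i] = seq
			return i, nil
		}
	}
	return -1, errors.New("no free sequence slot")
}

func main() {
	s := &Server{seqs: make([]*Sequence, 2)}
	for _, p := range []string{"a", "b", "c"} {
		i, err := s.claim(&Sequence{prompt: p})
		fmt.Println(p, i, err) // the third claim fails: both slots are held
	}
}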


@@ -128,17 +128,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 		}
 	}
 
-	// On linux and windows, over-allocating CPU memory will almost always result in an error
-	// Darwin has fully dynamic swap so has no direct concept of free swap space
-	if runtime.GOOS != "darwin" {
-		systemMemoryRequired := estimate.TotalSize - estimate.VRAMSize
-		available := systemFreeMemory + systemSwapFreeMemory
-		if systemMemoryRequired > available {
-			slog.Warn("model request too large for system", "requested", format.HumanBytes2(systemMemoryRequired), "available", available, "total", format.HumanBytes2(systemTotalMemory), "free", format.HumanBytes2(systemFreeMemory), "swap", format.HumanBytes2(systemSwapFreeMemory))
-			return nil, fmt.Errorf("model requires more system memory (%s) than is available (%s)", format.HumanBytes2(systemMemoryRequired), format.HumanBytes2(available))
-		}
-	}
-
 	estimate.log()
 
 	// Loop through potential servers
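The deleted guard estimated the CPU-resident share of the model as estimate.TotalSize - estimate.VRAMSize and refused to load when that exceeded free RAM plus free swap, skipping Darwin because macOS swap grows dynamically; after this commit the load proceeds and memory pressure is left to the OS. A small sketch of the removed arithmetic with made-up sizes (none of these numbers come from the diff):

package main

import "fmt"

const GiB = 1 << 30

func main() {
	totalSize := int64(13 * GiB)  // hypothetical full model estimate
	vramSize := int64(9 * GiB)    // share expected to fit in VRAM
	freeRAM := int64(5 * GiB / 2) // 2.5 GiB free system memory
	freeSwap := int64(1 * GiB)    // 1 GiB free swap

	required := totalSize - vramSize // 4 GiB must live in system memory
	available := freeRAM + freeSwap  // only 3.5 GiB available

	if required > available {
		fmt.Println("old behavior: refuse to load the model") // taken here
	} else {
		fmt.Println("load proceeds")
	}
}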