Compare commits

..

2 Commits

Author          SHA1        Message                                                                   Date
(unknown)       1401b24c79  Remove mem check                                                          2024-11-14 13:26:13 +01:00
Blake Mizerany  67691e410d  cmd: preserve exact bytes when displaying template/system layers (#7586)  2024-11-13 23:53:30 -08:00
3 changed files with 22 additions and 60 deletions

cmd/cmd.go

@@ -800,9 +800,9 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
 	case "parameters":
 		fmt.Println(resp.Parameters)
 	case "system":
-		fmt.Println(resp.System)
+		fmt.Print(resp.System)
 	case "template":
-		fmt.Println(resp.Template)
+		fmt.Print(resp.Template)
 	}
 
 	return nil

llama/runner/runner.go

@@ -20,8 +20,6 @@ import (
 	"time"
 	"unicode/utf8"
 
-	"golang.org/x/sync/semaphore"
-
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/llama"
 )
@@ -205,51 +203,38 @@ func (s *Server) inputs(prompt string, images []ImageData) ([]input, error) {
 }
 
 type Server struct {
-	// is the server ready to process requests?
-	// protects access to model and image
-	ready sync.WaitGroup
-
-	// loaded model
 	model *llama.Model
+	lc    *llama.Context
 
-	// image model context for multi-modal models
+	// required for image embeddings
 	image *ImageContext
 
-	// status for external health reporting - loading, ready to serve, etc.
-	status ServerStatus
-
-	// current progress on loading the model
-	progress float32
-
-	// number of simultaneous requests to handle
-	parallel int
-
-	// maximum number of elements in a batch (per sequence)
 	// TODO (jmorganca): make this n_batch
 	batchSize int
 
-	// protects access to everything below this line
-	// this is context state needed for decoding
-	mu sync.Mutex
+	// parallel is the number of parallel requests to handle
+	parallel int
 
-	// indicates that data is ready for processing
-	cond *sync.Cond
-
-	// decoding state
-	lc *llama.Context
-
-	// the list of simultaneous sequences being evaluated
+	// seqs is the list of parallel sequences being evaluated
+	// TODO (jmorganca): this can probably be moved into run()
 	seqs []*Sequence
 
-	// seqs can have a maximum of parallel entries, which
-	// is enfoced by seqSem
-	seqsSem *semaphore.Weighted
-
 	// KV cache
 	cache *InputCache
 
 	// next sequence for prompt processing to avoid starvation
 	nextSeq int
+
+	// is the server ready to process requests?
+	ready sync.WaitGroup
+
+	mu sync.Mutex
+
+	cond *sync.Cond
+
+	progress float32
+
+	status ServerStatus
 }
 
 func (s *Server) allNil() bool {
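
Both sides of this hunk pair a sync.Mutex with a sync.Cond to signal that new sequence data is ready for the decode loop. A minimal, self-contained sketch of that pattern follows; the names are illustrative, not the runner's actual code:

```go
package main

import (
	"fmt"
	"sync"
)

func main() {
	var mu sync.Mutex
	cond := sync.NewCond(&mu)
	queue := []string{}

	// Consumer: sleeps on the condition variable until work arrives,
	// the same shape as a decode loop waiting on s.cond.
	done := make(chan struct{})
	go func() {
		defer close(done)
		mu.Lock()
		defer mu.Unlock()
		for len(queue) == 0 {
			cond.Wait() // releases mu while asleep, reacquires on wake
		}
		fmt.Println("processing", queue[0])
	}()

	// Producer: publishes a sequence under the lock, then signals.
	mu.Lock()
	queue = append(queue, "seq-0")
	mu.Unlock()
	cond.Signal()

	<-done
}
```
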
@@ -624,13 +609,8 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
 		return
 	}
 
-	// Ensure that a place to put the sequence is available
-	if err := s.seqsSem.Acquire(r.Context(), 1); err != nil {
-		slog.Error("Failed to acquire semaphore", "error", err)
-		return
-	}
-	defer s.seqsSem.Release(1)
-
+	// TODO (jmorganca): add to sequence queue instead of
+	// failing if a slot isn't available
 	s.mu.Lock()
 	for i, sq := range s.seqs {
 		if sq == nil {
@@ -713,13 +693,7 @@ func (s *Server) embeddings(w http.ResponseWriter, r *http.Request) {
 		return
 	}
 
-	// Ensure that a place to put the sequence is available
-	if err := s.seqsSem.Acquire(r.Context(), 1); err != nil {
-		slog.Error("Failed to acquire semaphore", "error", err)
-		return
-	}
-	defer s.seqsSem.Release(1)
-
+	// TODO (jessegross): Wait for a free slot instead of failing and blocking forever
 	s.mu.Lock()
 	for i, sq := range s.seqs {
 		if sq == nil {
@@ -874,7 +848,6 @@ func main() {
 		batchSize: *batchSize,
 		parallel:  *parallel,
 		seqs:      make([]*Sequence, *parallel),
-		seqsSem:   semaphore.NewWeighted(int64(*parallel)),
 		status:    ServerStatusLoadingModel,
 	}
 
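The seqsSem lines removed across these hunks implemented admission control with golang.org/x/sync/semaphore: a weighted semaphore sized to parallel makes a handler block until a sequence slot frees up, or return early if the client disconnects, rather than fail outright. A rough sketch of the same pattern, with illustrative names:

```go
package main

import (
	"context"
	"fmt"
	"sync"
	"time"

	"golang.org/x/sync/semaphore"
)

func main() {
	const parallel = 2
	sem := semaphore.NewWeighted(parallel)

	handle := func(ctx context.Context, id int) error {
		// Mirrors s.seqsSem.Acquire(r.Context(), 1): blocks until a slot
		// is free, or returns an error if ctx is canceled first.
		if err := sem.Acquire(ctx, 1); err != nil {
			return err
		}
		defer sem.Release(1)

		time.Sleep(10 * time.Millisecond) // stand-in for running a sequence
		fmt.Printf("request %d ran in a slot\n", id)
		return nil
	}

	var wg sync.WaitGroup
	for i := 0; i < 5; i++ {
		wg.Add(1)
		go func(id int) {
			defer wg.Done()
			_ = handle(context.Background(), id)
		}(i)
	}
	wg.Wait()
}
```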

llm/server.go

@@ -128,17 +128,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 		}
 	}
 
-	// On linux and windows, over-allocating CPU memory will almost always result in an error
-	// Darwin has fully dynamic swap so has no direct concept of free swap space
-	if runtime.GOOS != "darwin" {
-		systemMemoryRequired := estimate.TotalSize - estimate.VRAMSize
-		available := systemFreeMemory + systemSwapFreeMemory
-		if systemMemoryRequired > available {
-			slog.Warn("model request too large for system", "requested", format.HumanBytes2(systemMemoryRequired), "available", available, "total", format.HumanBytes2(systemTotalMemory), "free", format.HumanBytes2(systemFreeMemory), "swap", format.HumanBytes2(systemSwapFreeMemory))
-			return nil, fmt.Errorf("model requires more system memory (%s) than is available (%s)", format.HumanBytes2(systemMemoryRequired), format.HumanBytes2(available))
-		}
-	}
-
 	estimate.log()
 
 	// Loop through potential servers
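
For reference, the guard deleted by the "Remove mem check" commit amounts to the following check. The function below is a hypothetical reconstruction for illustration, not code from the repository:

```go
package main

import (
	"fmt"
	"runtime"
)

// fitsInSystemMemory reproduces the shape of the removed check: outside of
// Darwin, the CPU-resident share of the load estimate must fit in free RAM
// plus free swap. All inputs are in bytes.
func fitsInSystemMemory(totalSize, vramSize, freeRAM, freeSwap uint64) error {
	if runtime.GOOS == "darwin" {
		// Darwin's swap is fully dynamic, so there is no hard limit to test.
		return nil
	}
	required := totalSize - vramSize // portion of the model kept in CPU memory
	if available := freeRAM + freeSwap; required > available {
		return fmt.Errorf("model requires more system memory (%d) than is available (%d)", required, available)
	}
	return nil
}

func main() {
	// 8 GiB model, 6 GiB offloaded to VRAM, 1 GiB free RAM, 512 MiB free swap.
	fmt.Println(fitsInSystemMemory(8<<30, 6<<30, 1<<30, 512<<20))
}
```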