Compare commits


2 Commits

Author           SHA1         Message                                                                     Date
                 1401b24c79   Remove mem check                                                            2024-11-14 13:26:13 +01:00
Blake Mizerany   67691e410d   cmd: preserve exact bytes when displaying template/system layers (#7586)   2024-11-13 23:53:30 -08:00
3 changed files with 22 additions and 60 deletions


@@ -800,9 +800,9 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
 	case "parameters":
 		fmt.Println(resp.Parameters)
 	case "system":
-		fmt.Println(resp.System)
+		fmt.Print(resp.System)
 	case "template":
-		fmt.Println(resp.Template)
+		fmt.Print(resp.Template)
 	}
 
 	return nil
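Note: fmt.Println appends a trailing newline, so ollama show was emitting the system and template layers with an extra byte that changed their round-tripped contents; fmt.Print writes the stored value exactly. A minimal standalone sketch of the difference (the template literal is an illustrative value, not one from the diff):

package main

import "fmt"

func main() {
	tmpl := "{{ .Prompt }}\n" // stored value already ends in a newline

	fmt.Println(tmpl) // writes 15 bytes: the value plus an added '\n'
	fmt.Print(tmpl)   // writes 14 bytes: the exact stored contents
}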


@@ -20,8 +20,6 @@ import (
 	"time"
 	"unicode/utf8"
 
-	"golang.org/x/sync/semaphore"
-
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/llama"
 )
@@ -205,51 +203,38 @@ func (s *Server) inputs(prompt string, images []ImageData) ([]input, error) {
 }
 
 type Server struct {
-	// is the server ready to process requests?
-	// protects access to model and image
-	ready sync.WaitGroup
-
-	// loaded model
 	model *llama.Model
+	lc    *llama.Context
 
-	// image model context for multi-modal models
+	// required for image embeddings
 	image *ImageContext
 
-	// status for external health reporting - loading, ready to serve, etc.
-	status ServerStatus
-
-	// current progress on loading the model
-	progress float32
-
-	// number of simultaneous requests to handle
-	parallel int
-
-	// maximum number of elements in a batch (per sequence)
 	// TODO (jmorganca): make this n_batch
 	batchSize int
 
-	// protects access to everything below this line
-	// this is context state needed for decoding
-	mu sync.Mutex
-
-	// indicates that data is ready for processing
-	cond *sync.Cond
-
-	// decoding state
-	lc *llama.Context
+	// parallel is the number of parallel requests to handle
+	parallel int
 
-	// the list of simultaneous sequences being evaluated
+	// seqs is the list of parallel sequences being evaluated
+	// TODO (jmorganca): this can probably be moved into run()
 	seqs []*Sequence
 
-	// seqs can have a maximum of parallel entries, which
-	// is enfoced by seqSem
-	seqsSem *semaphore.Weighted
-
 	// KV cache
 	cache *InputCache
 
 	// next sequence for prompt processing to avoid starvation
 	nextSeq int
+
+	// is the server ready to process requests?
+	ready sync.WaitGroup
+
+	mu sync.Mutex
+
+	cond *sync.Cond
+
+	progress float32
+
+	status ServerStatus
 }
 
 func (s *Server) allNil() bool {
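For context on the struct change: seqsSem bounded the number of in-flight sequences to parallel using golang.org/x/sync/semaphore, and this compare removes that in favor of managing the seqs slots under mu alone. A hedged, self-contained sketch of the weighted-semaphore pattern the removed field implemented (the request and slot names are illustrative, not the runner's):

package main

import (
	"context"
	"fmt"
	"sync"
	"time"

	"golang.org/x/sync/semaphore"
)

func main() {
	const parallel = 2
	sem := semaphore.NewWeighted(parallel) // at most `parallel` concurrent holders

	var wg sync.WaitGroup
	for i := 0; i < 5; i++ {
		wg.Add(1)
		go func(id int) {
			defer wg.Done()
			// Acquire blocks until a slot frees up, or fails if the
			// context is cancelled (the runner passed r.Context() here).
			if err := sem.Acquire(context.Background(), 1); err != nil {
				return
			}
			defer sem.Release(1)
			fmt.Println("request", id, "holds a sequence slot")
			time.Sleep(10 * time.Millisecond) // stand-in for decoding work
		}(i)
	}
	wg.Wait()
}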
@@ -624,13 +609,8 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
 		return
 	}
 
-	// Ensure that a place to put the sequence is available
-	if err := s.seqsSem.Acquire(r.Context(), 1); err != nil {
-		slog.Error("Failed to acquire semaphore", "error", err)
-		return
-	}
-	defer s.seqsSem.Release(1)
-
+	// TODO (jmorganca): add to sequence queue instead of
+	// failing if a slot isn't available
 	s.mu.Lock()
 	for i, sq := range s.seqs {
 		if sq == nil {
@@ -713,13 +693,7 @@ func (s *Server) embeddings(w http.ResponseWriter, r *http.Request) {
 		return
 	}
 
-	// Ensure that a place to put the sequence is available
-	if err := s.seqsSem.Acquire(r.Context(), 1); err != nil {
-		slog.Error("Failed to acquire semaphore", "error", err)
-		return
-	}
-	defer s.seqsSem.Release(1)
-
+	// TODO (jessegross): Wait for a free slot instead of failing and blocking forever
 	s.mu.Lock()
 	for i, sq := range s.seqs {
 		if sq == nil {
@@ -874,7 +848,6 @@ func main() {
 		batchSize: *batchSize,
 		parallel:  *parallel,
 		seqs:      make([]*Sequence, *parallel),
-		seqsSem:   semaphore.NewWeighted(int64(*parallel)),
 		status:    ServerStatusLoadingModel,
 	}
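Without the semaphore, both handlers take s.mu and scan s.seqs for a nil entry, failing immediately when every slot is busy, which is exactly what the restored TODO comments call out. A rough, self-contained sketch of that slot-scan pattern (Server and Sequence here are cut-down stand-ins, not the runner's real types):

package main

import (
	"errors"
	"fmt"
	"sync"
)

type Sequence struct{ prompt string }

type Server struct {
	mu   sync.Mutex
	seqs []*Sequence
}

// claim stores seq in the first free slot, or fails immediately when all
// slots are taken (the behavior the TODOs want replaced with queuing).
func (s *Server) claim(seq *Sequence) (int, error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	for i, sq := range s.seqs {
		if sq == nil {
			s.seqs[i] = seq
			return i, nil
		}
	}
	return -1, errors.New("no free sequence slot")
}

func main() {
	s := &Server{seqs: make([]*Sequence, 2)}
	for _, p := range []string{"a", "b", "c"} {
		i, err := s.claim(&Sequence{prompt: p})
		fmt.Println(p, i, err) // the third claim fails: both slots are held
	}
}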


@@ -128,17 +128,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 		}
 	}
 
-	// On linux and windows, over-allocating CPU memory will almost always result in an error
-	// Darwin has fully dynamic swap so has no direct concept of free swap space
-	if runtime.GOOS != "darwin" {
-		systemMemoryRequired := estimate.TotalSize - estimate.VRAMSize
-		available := systemFreeMemory + systemSwapFreeMemory
-		if systemMemoryRequired > available {
-			slog.Warn("model request too large for system", "requested", format.HumanBytes2(systemMemoryRequired), "available", available, "total", format.HumanBytes2(systemTotalMemory), "free", format.HumanBytes2(systemFreeMemory), "swap", format.HumanBytes2(systemSwapFreeMemory))
-			return nil, fmt.Errorf("model requires more system memory (%s) than is available (%s)", format.HumanBytes2(systemMemoryRequired), format.HumanBytes2(available))
-		}
-	}
-
 	estimate.log()
 
 	// Loop through potential servers
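The deleted guard estimated the CPU-resident share of the model as estimate.TotalSize - estimate.VRAMSize and refused to load when that exceeded free RAM plus free swap, skipping Darwin because macOS swap grows dynamically; after this commit the load proceeds and memory pressure is left to the OS. A small sketch of the removed arithmetic with made-up sizes (none of these numbers come from the diff):

package main

import "fmt"

const GiB = 1 << 30

func main() {
	totalSize := int64(13 * GiB)  // hypothetical full model estimate
	vramSize := int64(9 * GiB)    // share expected to fit in VRAM
	freeRAM := int64(5 * GiB / 2) // 2.5 GiB free system memory
	freeSwap := int64(1 * GiB)    // 1 GiB free swap

	required := totalSize - vramSize // 4 GiB must live in system memory
	available := freeRAM + freeSwap  // only 3.5 GiB available

	if required > available {
		fmt.Println("old behavior: refuse to load the model") // taken here
	} else {
		fmt.Println("load proceeds")
	}
}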