From fc37c192ae9d7244090107b0d5a6442440d194db Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen
Date: Mon, 3 Jun 2024 19:09:23 -0700
Subject: [PATCH] Refine CPU load behavior with system memory visibility

---
 gpu/gpu.go           | 96 ++++++++++++++++++++++++++++++++++++++------
 gpu/gpu_darwin.go    | 11 +++++
 gpu/gpu_info_cpu.c   |  6 +--
 gpu/gpu_info_nvml.c  | 12 +----
 llm/server.go        | 37 +++++++++++------
 server/sched.go      | 87 +++++++++++++++++++++++++--------------
 server/sched_test.go | 60 +++++++++++++++------------
 7 files changed, 211 insertions(+), 98 deletions(-)

diff --git a/gpu/gpu.go b/gpu/gpu.go
index 988c20af..9a01c363 100644
--- a/gpu/gpu.go
+++ b/gpu/gpu.go
@@ -11,6 +11,8 @@ package gpu
 */
 import "C"
 import (
+	"bufio"
+	"bytes"
 	"fmt"
 	"log/slog"
 	"os"
@@ -246,6 +248,17 @@ func initOneAPIHandles() *oneapiHandles {
 	return oHandles
 }
 
+func GetCPUInfo() GpuInfoList {
+	gpuMutex.Lock()
+	if !bootstrapped {
+		gpuMutex.Unlock()
+		GetGPUInfo()
+	} else {
+		gpuMutex.Unlock()
+	}
+	return GpuInfoList{cpus[0].GpuInfo}
+}
+
 func GetGPUInfo() GpuInfoList {
 	// TODO - consider exploring lspci (and equivalent on windows) to check for
 	// GPUs so we can report warnings if we see Nvidia/AMD but fail to load the libraries
@@ -279,22 +292,19 @@ func GetGPUInfo() GpuInfoList {
 		needRefresh = false
 		cpuCapability = getCPUCapability()
 		var memInfo C.mem_info_t
-		C.cpu_check_ram(&memInfo)
-		if memInfo.err != nil {
-			slog.Info("error looking up CPU memory", "error", C.GoString(memInfo.err))
-			C.free(unsafe.Pointer(memInfo.err))
-			return []GpuInfo{}
+
+		mem, err := GetCPUMem()
+		if err != nil {
+			slog.Warn("error looking up system memory", "error", err)
 		}
-		cpuInfo := CPUInfo{
+		cpus = []CPUInfo{CPUInfo{
 			GpuInfo: GpuInfo{
+				memInfo: mem,
 				Library: "cpu",
 				Variant: cpuCapability.ToVariant(),
+				ID:      "0",
 			},
-		}
-		cpuInfo.TotalMemory = uint64(memInfo.total)
-		cpuInfo.FreeMemory = uint64(memInfo.free)
-		cpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
-		cpus = []CPUInfo{cpuInfo}
+		}}
 
 		// Fallback to CPU mode if we're lacking required vector extensions on x86
 		if cpuCapability < GPURunnerCPUCapability && runtime.GOARCH == "amd64" {
@@ -394,7 +404,25 @@ func GetGPUInfo() GpuInfoList {
 
 	// Refresh free memory usage
 	if needRefresh {
-		// TODO - CPU system memory tracking/refresh
+		mem, err := GetCPUMem()
+		if err != nil {
+			slog.Warn("error looking up system memory", "error", err)
+		} else {
+			slog.Debug("updating system memory data",
+				slog.Group(
+					"before",
+					"total", format.HumanBytes2(cpus[0].TotalMemory),
+					"free", format.HumanBytes2(cpus[0].FreeMemory),
+				),
+				slog.Group(
+					"now",
+					"total", format.HumanBytes2(mem.TotalMemory),
+					"free", format.HumanBytes2(mem.FreeMemory),
+				),
+			)
+			cpus[0].FreeMemory = mem.FreeMemory
+		}
+
 		var memInfo C.mem_info_t
 		if cHandles == nil && len(cudaGPUs) > 0 {
 			cHandles = initCudaHandles()
@@ -455,7 +483,7 @@ func GetGPUInfo() GpuInfoList {
 			oneapiGPUs[i].FreeMemory = uint64(memInfo.free)
 		}
 
-		err := RocmGPUInfoList(rocmGPUs).RefreshFreeMemory()
+		err = RocmGPUInfoList(rocmGPUs).RefreshFreeMemory()
 		if err != nil {
 			slog.Debug("problem refreshing ROCm free memory", "error", err)
 		}
@@ -478,6 +506,9 @@ func GetGPUInfo() GpuInfoList {
 }
 
 func GetCPUMem() (memInfo, error) {
+	if runtime.GOOS == "linux" {
+		return GetLinuxMemInfo()
+	}
 	var ret memInfo
 	var info C.mem_info_t
 	C.cpu_check_ram(&info)
@@ -651,3 +682,42 @@ func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) {
 		return "", ""
 	}
 }
+
+func GetLinuxMemInfo() (memInfo, error) {
+	var mem memInfo
+	var total, available, free, buffers, cached uint64
+	f, err := os.Open("/proc/meminfo")
err := os.Open("/proc/meminfo") + if err != nil { + return mem, err + } + defer f.Close() + s := bufio.NewScanner(f) + for s.Scan() { + switch { + case bytes.HasPrefix(s.Bytes(), []byte(`MemTotal:`)): + _, err = fmt.Sscanf(s.Text(), "MemTotal:%d", &total) + case bytes.HasPrefix(s.Bytes(), []byte(`MemAvailable:`)): + _, err = fmt.Sscanf(s.Text(), "MemAvailable:%d", &available) + case bytes.HasPrefix(s.Bytes(), []byte(`MemFree:`)): + _, err = fmt.Sscanf(s.Text(), "MemFree:%d", &free) + case bytes.HasPrefix(s.Bytes(), []byte(`Buffers:`)): + _, err = fmt.Sscanf(s.Text(), "Buffers:%d", &buffers) + case bytes.HasPrefix(s.Bytes(), []byte(`Cached:`)): + _, err = fmt.Sscanf(s.Text(), "Cached:%d", &cached) + default: + continue + } + if err != nil { + return mem, err + } + + if total > 0 && available > 0 { + mem.TotalMemory = total * 1024 + mem.FreeMemory = available * 1024 + return mem, nil + } + } + mem.TotalMemory = total * 1024 + mem.FreeMemory = (free + buffers + cached) * 1024 + return mem, nil +} diff --git a/gpu/gpu_darwin.go b/gpu/gpu_darwin.go index f8cc1adb..d21be410 100644 --- a/gpu/gpu_darwin.go +++ b/gpu/gpu_darwin.go @@ -42,6 +42,17 @@ func GetGPUInfo() GpuInfoList { return []GpuInfo{info} } +func GetCPUInfo() GpuInfoList { + mem, _ := GetCPUMem() + return []GpuInfo{ + { + Library: "cpu", + Variant: GetCPUVariant(), + memInfo: mem, + }, + } +} + func GetCPUMem() (memInfo, error) { return memInfo{ TotalMemory: uint64(C.getPhysicalMemory()), diff --git a/gpu/gpu_info_cpu.c b/gpu/gpu_info_cpu.c index 6cbe28b0..f7d849d7 100644 --- a/gpu/gpu_info_cpu.c +++ b/gpu/gpu_info_cpu.c @@ -35,11 +35,7 @@ void cpu_check_ram(mem_info_t *resp) { } #elif __APPLE__ -// TODO consider an Apple implementation that does something useful -// mem_info_t cpu_check_ram() { -// mem_info_t resp = {0, 0, NULL}; -// return resp; -// } +// Unused - see gpu_darwin.go #else #error "Unsupported platform" #endif diff --git a/gpu/gpu_info_nvml.c b/gpu/gpu_info_nvml.c index e152a45c..ef0a97df 100644 --- a/gpu/gpu_info_nvml.c +++ b/gpu/gpu_info_nvml.c @@ -11,8 +11,6 @@ void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) { char buf[buflen + 1]; int i; - LOG(1, "XXX starting nvml_init %s\n", nvml_lib_path); - struct lookup { char *s; void **p; @@ -37,13 +35,11 @@ void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) { } // TODO once we've squashed the remaining corner cases remove this log -// LOG(resp->ch.verbose, "wiring nvidia management library functions in %s\n", nvml_lib_path); + // LOG(resp->ch.verbose, "wiring nvidia management library functions in %s\n", nvml_lib_path); - LOG(1, "XXX wiring functions nvml_init\n"); - for (i = 0; l[i].s != NULL; i++) { // TODO once we've squashed the remaining corner cases remove this log - LOG(resp->ch.verbose, "dlsym: %s\n", l[i].s); + // LOG(resp->ch.verbose, "dlsym: %s\n", l[i].s); *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s); if (!l[i].p) { @@ -58,7 +54,6 @@ void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) { return; } } - LOG(1, "XXX calling init_v2\n"); ret = (*resp->ch.nvmlInit_v2)(); if (ret != NVML_SUCCESS) { @@ -69,8 +64,6 @@ void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) { resp->err = strdup(buf); return; } - LOG(1, "XXX nvml_init done\n"); - } @@ -78,7 +71,6 @@ void nvml_get_free(nvml_handle_t h, int device_id, uint64_t *free, uint64_t *tot nvmlDevice_t device; nvmlMemory_t memInfo = {0}; nvmlReturn_t ret; - LOG(1, "XXX in nvml_get_free\n"); ret = (*h.nvmlDeviceGetHandleByIndex)(device_id, &device); if (ret != NVML_SUCCESS) { LOG(1, 
"unable to get device handle %d: %d", device_id, ret); diff --git a/llm/server.go b/llm/server.go index eb3d6365..089c35d1 100644 --- a/llm/server.go +++ b/llm/server.go @@ -37,8 +37,9 @@ type LlamaServer interface { Tokenize(ctx context.Context, content string) ([]int, error) Detokenize(ctx context.Context, tokens []int) (string, error) Close() error - EstimatedVRAM() uint64 + EstimatedVRAM() uint64 // Total VRAM across all GPUs EstimatedTotal() uint64 + EstimagedVRAMByGPU(gpuID string) uint64 } // llmServer is an instance of the llama.cpp server @@ -49,10 +50,11 @@ type llmServer struct { status *StatusWriter options api.Options - estimate MemoryEstimate - totalLayers uint64 - gpuCount int - loadDuration time.Duration // Record how long it took the model to load + estimate MemoryEstimate + totalLayers uint64 + // gpuCount int + gpus gpu.GpuInfoList // Recorded just before the model loaded, free space will be incorrect + loadDuration time.Duration // Record how long it took the model to load loadProgress float32 sem *semaphore.Weighted @@ -80,12 +82,13 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr var cpuRunner string var estimate MemoryEstimate var systemMemory uint64 - gpuCount := len(gpus) - if (len(gpus) == 1 && gpus[0].Library == "cpu") || opts.NumGPU == 0 { - // TODO evaluate system memory to see if we should block the load, or force an unload of another CPU runner + // If the user wants zero GPU layers, reset the gpu list to be CPU/system ram info + if opts.NumGPU == 0 { + gpus = gpu.GetCPUInfo() + } + if len(gpus) == 1 && gpus[0].Library == "cpu" { cpuRunner = serverForCpu() - gpuCount = 0 estimate = EstimateGPULayers(gpus, ggml, projectors, opts) } else { if gpus[0].Library == "metal" { @@ -107,7 +110,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr case gpus[0].Library != "metal" && estimate.Layers == 0: // Don't bother loading into the GPU if no layers can fit cpuRunner = serverForCpu() - gpuCount = 0 + gpus = gpu.GetCPUInfo() case opts.NumGPU < 0 && estimate.Layers > 0 && gpus[0].Library != "cpu": opts.NumGPU = estimate.Layers } @@ -246,8 +249,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr } if strings.HasPrefix(servers[i], "cpu") { - // TODO if we tried a gpu runner first, and it failed, record the error and bubble that back up - gpuCount = 0 + gpus = gpu.GetCPUInfo() } // Find an availableServers port, retry on each iteration in case the failure was a port conflict race @@ -310,7 +312,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr estimate: estimate, sem: semaphore.NewWeighted(int64(numParallel)), totalLayers: ggml.KV().BlockCount() + 1, - gpuCount: gpuCount, + gpus: gpus, done: make(chan error, 1), } @@ -1014,6 +1016,15 @@ func (s *llmServer) EstimatedTotal() uint64 { return s.estimate.TotalSize } +func (s *llmServer) EstimagedVRAMByGPU(gpuID string) uint64 { + for i, gpu := range s.gpus { + if gpu.ID == gpuID { + return s.estimate.GPUSizes[i] + } + } + return 0 +} + func parseDurationMs(ms float64) time.Duration { dur, err := time.ParseDuration(fmt.Sprintf("%fms", ms)) if err != nil { diff --git a/server/sched.go b/server/sched.go index f6f12a67..d380c9d4 100644 --- a/server/sched.go +++ b/server/sched.go @@ -7,7 +7,6 @@ import ( "log/slog" "reflect" "runtime" - "slices" "sort" "strings" "sync" @@ -41,6 +40,7 @@ type Scheduler struct { loadFn func(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) newServerFn func(gpus 
 	getGpuFn     func() gpu.GpuInfoList
+	getCpuFn     func() gpu.GpuInfoList
 }
 
 var ErrMaxQueue = fmt.Errorf("server busy, please try again. maximum pending requests exceeded")
@@ -54,6 +54,7 @@ func InitScheduler(ctx context.Context) *Scheduler {
 		loaded:      make(map[string]*runnerRef),
 		newServerFn: llm.NewLlamaServer,
 		getGpuFn:    gpu.GetGPUInfo,
+		getCpuFn:    gpu.GetCPUInfo,
 	}
 	sched.loadFn = sched.load
 	return sched
@@ -131,7 +132,12 @@ func (s *Scheduler) processPending(ctx context.Context) {
 			} else {
 				// Either no models are loaded or below envconfig.MaxRunners
 				// Get a refreshed GPU list
-				gpus := s.getGpuFn()
+				var gpus gpu.GpuInfoList
+				if pending.opts.NumGPU == 0 {
+					gpus = s.getCpuFn()
+				} else {
+					gpus = s.getGpuFn()
+				}
 
 				// Load model for fitting
 				ggml, err := llm.LoadModel(pending.model.ModelPath)
@@ -140,16 +146,22 @@ func (s *Scheduler) processPending(ctx context.Context) {
 					break
 				}
 
-				// If we're CPU only mode, just limit by envconfig.MaxRunners above
-				// TODO handle system memory exhaustion
-				if (len(gpus) == 1 && gpus[0].Library == "cpu") || pending.opts.NumGPU == 0 {
-					slog.Debug("cpu mode with existing models, loading")
-					s.loadFn(pending, ggml, gpus)
-					break
-				}
-
-				// No models loaded. Load the model but prefer the best fit.
-				if loadedCount == 0 {
+				// Evaluate if the model will fit in the available system memory, or if we should unload a model first
+				if len(gpus) == 1 && gpus[0].Library == "cpu" {
+					if loadedCount == 0 {
+						slog.Debug("cpu mode with first model, loading")
+						s.loadFn(pending, ggml, gpus)
+						break
+					}
+					runnerToExpire = s.maybeFindCPURunnerToUnload(pending, ggml, gpus)
+					if runnerToExpire == nil {
+						slog.Debug("cpu mode with available system memory or first model, loading")
+						s.loadFn(pending, ggml, gpus)
+						break
+					}
+					// else we need to expire a runner
+				} else if loadedCount == 0 {
+					// No models loaded. Load the model but prefer the best fit.
slog.Debug("loading first model", "model", pending.model.ModelPath) g := pickBestFitGPUs(pending, ggml, gpus) if g != nil { @@ -159,16 +171,18 @@ func (s *Scheduler) processPending(ctx context.Context) { break } - // More than one loaded model, so we have to see if the new one fits - // Update free memory from currently loaded models - s.updateFreeSpace(gpus) - gpus = pickBestFitGPUs(pending, ggml, gpus) - if gpus != nil { - slog.Debug("new model fits with existing models, loading") - s.loadFn(pending, ggml, gpus) - break + if runnerToExpire == nil { + // More than one loaded model, so we have to see if the new one fits + // Update free memory from currently loaded models + s.updateFreeSpace(gpus) + gpus = pickBestFitGPUs(pending, ggml, gpus) + if gpus != nil { + slog.Debug("new model fits with existing models, loading") + s.loadFn(pending, ggml, gpus) + break + } + runnerToExpire = s.findRunnerToUnload() } - runnerToExpire = s.findRunnerToUnload() } if runnerToExpire == nil { @@ -368,17 +382,11 @@ func (s *Scheduler) updateFreeSpace(allGpus gpu.GpuInfoList) { s.loadedMu.Lock() for _, r := range s.loaded { r.refMu.Lock() - gpuIDs := make([]string, 0, len(r.gpus)) if r.llama != nil { - // TODO this should be broken down by GPU instead of assuming uniform spread - estimatedVRAMPerGPU := r.llama.EstimatedVRAM() / uint64(len(r.gpus)) - for _, gpu := range r.gpus { - gpuIDs = append(gpuIDs, gpu.ID) - } for _, gpu := range allGpus { - if slices.Contains(gpuIDs, gpu.ID) { - predMap[predKey{gpu.Library, gpu.ID}] += estimatedVRAMPerGPU - } + // if slices.Contains(gpuIDs, gpu.ID) { + predMap[predKey{gpu.Library, gpu.ID}] += r.llama.EstimagedVRAMByGPU(gpu.ID) + // } } } else { slog.Warn("unexpected nil runner reference, memory prediction may be incorrect") @@ -489,7 +497,8 @@ func (runner *runnerRef) waitForVRAMRecovery() chan interface{} { // CPU or Metal don't need checking, so no waiting required // windows can page VRAM, only cuda currently can report accurate used vram usage - if (len(runner.gpus) == 1 && (runner.gpus[0].Library == "cpu" || runner.gpus[0].Library == "metal")) || + if len(runner.gpus) == 0 || + (len(runner.gpus) == 1 && (runner.gpus[0].Library == "cpu" || runner.gpus[0].Library == "metal")) || (runtime.GOOS == "windows" && runner.gpus[0].Library != "cuda") { finished <- struct{}{} return finished @@ -624,3 +633,19 @@ func (s *Scheduler) unloadAllRunners() { } } } + +// If other runners are loaded, make sure the pending request will fit in system memory +// If not, pick a runner to unload, else return nil and the request can be loaded +func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) *runnerRef { + slog.Debug("evaluating if CPU model load will fit in available system memory") + estimate := llm.EstimateGPULayers(gpus, ggml, req.model.ProjectorPaths, req.opts) + if estimate.TotalSize <= gpus[0].FreeMemory { + slog.Debug("cpu inference mode, model fits in available system memory", "model", format.HumanBytes2(estimate.TotalSize), "available", format.HumanBytes2(gpus[0].FreeMemory)) + return nil + } + + // TODO - optimization: try to find CPU only runners first, or partial offloads with enough in system memory to make room + + return s.findRunnerToUnload() + +} diff --git a/server/sched_test.go b/server/sched_test.go index e7ea5874..639da5f9 100644 --- a/server/sched_test.go +++ b/server/sched_test.go @@ -60,7 +60,7 @@ func TestLoad(t *testing.T) { err := <-req.errCh require.Contains(t, err.Error(), "this model may be incompatible") 
- server := &mockLlm{estimatedVRAM: 10} + server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}} s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options) (llm.LlamaServer, error) { return server, nil } @@ -146,7 +146,7 @@ func newScenario(t *testing.T, ctx context.Context, modelName string, estimatedV successCh: make(chan *runnerRef, 1), errCh: make(chan error, 1), } - scenario.srv = &mockLlm{estimatedVRAM: estimatedVRAM} + scenario.srv = &mockLlm{estimatedVRAM: estimatedVRAM, estimatedVRAMByGPU: map[string]uint64{"": estimatedVRAM}} return scenario } @@ -182,6 +182,12 @@ func TestRequests(t *testing.T) { g.FreeMemory = 12 * format.GigaByte return []gpu.GpuInfo{g} } + s.getCpuFn = func() gpu.GpuInfoList { + g := gpu.GpuInfo{Library: "cpu"} + g.TotalMemory = 32 * format.GigaByte + g.FreeMemory = 26 * format.GigaByte + return []gpu.GpuInfo{g} + } s.newServerFn = scenario1a.newServer slog.Info("scenario1a") s.pendingReqCh <- scenario1a.req @@ -420,7 +426,7 @@ func TestUseLoadedRunner(t *testing.T) { sessionDuration: 2, } finished := make(chan *LlmRequest) - llm1 := &mockLlm{} + llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}} r1 := &runnerRef{llama: llm1, sessionDuration: 1} req.useLoadedRunner(r1, finished) require.Equal(t, uint(1), r1.refCount) @@ -453,8 +459,8 @@ func TestUpdateFreeSpace(t *testing.T) { gpus[0].FreeMemory = 900 gpus[1].TotalMemory = 2000 gpus[1].FreeMemory = 1900 - llm1 := &mockLlm{estimatedVRAM: 100} - llm2 := &mockLlm{estimatedVRAM: 200} + llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{"1": 50, "2": 50}} + llm2 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{"1": 125, "2": 75}} r1 := &runnerRef{llama: llm1, gpus: gpus} r2 := &runnerRef{llama: llm2, gpus: gpus} @@ -465,8 +471,8 @@ func TestUpdateFreeSpace(t *testing.T) { s.loadedMu.Unlock() s.updateFreeSpace(gpus) - require.Equal(t, uint64(850), gpus[0].FreeMemory) - require.Equal(t, uint64(1850), gpus[1].FreeMemory) + require.Equal(t, uint64(1000-50-125), gpus[0].FreeMemory) + require.Equal(t, uint64(2000-50-75), gpus[1].FreeMemory) } func TestFindRunnerToUnload(t *testing.T) { @@ -493,7 +499,7 @@ func TestNeedsReload(t *testing.T) { ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond) defer done() - llm := &mockLlm{} + llm := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}} do := api.DefaultOptions() runner := &runnerRef{ model: &Model{AdapterPaths: []string{"adapter1"}, ProjectorPaths: []string{"projector1"}}, @@ -536,8 +542,8 @@ func TestUnloadAllRunners(t *testing.T) { ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond) defer done() - llm1 := &mockLlm{} - llm2 := &mockLlm{} + llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}} + llm2 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}} s := InitScheduler(ctx) s.unloadAllRunners() @@ -555,7 +561,7 @@ func TestUnloadAllRunners(t *testing.T) { } func TestUnload(t *testing.T) { - llm1 := &mockLlm{} + llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}} r1 := &runnerRef{llama: llm1} r2 := &runnerRef{model: &Model{AdapterPaths: []string{"A"}}} r1.unload() @@ -565,19 +571,20 @@ func TestUnload(t *testing.T) { } type mockLlm struct { - pingResp error - waitResp error - completionResp error - embeddingResp []float64 - embeddingRespErr error - tokenizeResp []int - tokenizeRespErr error - detokenizeResp string - detonekizeRespErr error - closeResp error - closeCalled bool - estimatedVRAM uint64 - 
+	pingResp           error
+	waitResp           error
+	completionResp     error
+	embeddingResp      []float64
+	embeddingRespErr   error
+	tokenizeResp       []int
+	tokenizeRespErr    error
+	detokenizeResp     string
+	detonekizeRespErr  error
+	closeResp          error
+	closeCalled        bool
+	estimatedVRAM      uint64
+	estimatedTotal     uint64
+	estimatedVRAMByGPU map[string]uint64
 }
 
 func (s *mockLlm) Ping(ctx context.Context) error { return s.pingResp }
@@ -598,5 +605,6 @@ func (s *mockLlm) Close() error {
 	s.closeCalled = true
 	return s.closeResp
 }
-func (s *mockLlm) EstimatedVRAM() uint64  { return s.estimatedVRAM }
-func (s *mockLlm) EstimatedTotal() uint64 { return s.estimatedTotal }
+func (s *mockLlm) EstimatedVRAM() uint64                  { return s.estimatedVRAM }
+func (s *mockLlm) EstimatedTotal() uint64                 { return s.estimatedTotal }
+func (s *mockLlm) EstimatedVRAMByGPU(gpuid string) uint64 { return s.estimatedVRAMByGPU[gpuid] }
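
---
Reviewer note (illustrative sketch, not part of the patch): GetLinuxMemInfo prefers MemAvailable, which the kernel has reported since Linux 3.14, and only falls back to MemFree + Buffers + Cached on older kernels; /proc/meminfo values are in KiB, hence the * 1024 conversions. A minimal standalone mirror of that parsing rule, using a hypothetical parseMemInfo helper so both paths can be exercised against canned input:

	package main

	import (
		"bufio"
		"bytes"
		"fmt"
		"strings"
	)

	// parseMemInfo mirrors the switch in GetLinuxMemInfo, factored to take
	// arbitrary input instead of reading /proc/meminfo directly.
	func parseMemInfo(input string) (totalBytes, freeBytes uint64, err error) {
		var total, available, free, buffers, cached uint64
		s := bufio.NewScanner(strings.NewReader(input))
		for s.Scan() {
			switch {
			case bytes.HasPrefix(s.Bytes(), []byte("MemTotal:")):
				_, err = fmt.Sscanf(s.Text(), "MemTotal:%d", &total)
			case bytes.HasPrefix(s.Bytes(), []byte("MemAvailable:")):
				_, err = fmt.Sscanf(s.Text(), "MemAvailable:%d", &available)
			case bytes.HasPrefix(s.Bytes(), []byte("MemFree:")):
				_, err = fmt.Sscanf(s.Text(), "MemFree:%d", &free)
			case bytes.HasPrefix(s.Bytes(), []byte("Buffers:")):
				_, err = fmt.Sscanf(s.Text(), "Buffers:%d", &buffers)
			case bytes.HasPrefix(s.Bytes(), []byte("Cached:")):
				_, err = fmt.Sscanf(s.Text(), "Cached:%d", &cached)
			default:
				continue
			}
			if err != nil {
				return 0, 0, err
			}
			// Newer kernels: MemAvailable is authoritative, stop early.
			if total > 0 && available > 0 {
				return total * 1024, available * 1024, nil
			}
		}
		// Older kernels: approximate available memory.
		return total * 1024, (free + buffers + cached) * 1024, nil
	}

	func main() {
		old := "MemTotal: 8000 kB\nMemFree: 1000 kB\nBuffers: 200 kB\nCached: 800 kB\n"
		withAvail := old + "MemAvailable: 2500 kB\n"
		t1, f1, _ := parseMemInfo(old)       // 8192000 2048000 (fallback sum)
		t2, f2, _ := parseMemInfo(withAvail) // 8192000 2560000 (MemAvailable)
		fmt.Println(t1, f1)
		fmt.Println(t2, f2)
	}

Note that fmt.Sscanf ignores the trailing " kB" on each line, which is why the patch's formats can stop at %d.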
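Reviewer note (illustrative sketch, not part of the patch): with EstimatedVRAMByGPU, updateFreeSpace charges each runner's estimate to the specific GPU holding it, rather than spreading EstimatedVRAM uniformly across all GPUs. A toy calculation with hypothetical names, mirroring TestUpdateFreeSpace's expectations:

	package main

	import "fmt"

	func main() {
		totals := map[string]uint64{"1": 1000, "2": 2000}

		// Per-runner, per-GPU usage, as wired into the mocks above.
		runners := []map[string]uint64{
			{"1": 50, "2": 50},  // llm1
			{"1": 125, "2": 75}, // llm2
		}

		// Sum the per-GPU predictions, then recompute free space from totals.
		predicted := map[string]uint64{}
		for _, r := range runners {
			for id, vram := range r {
				predicted[id] += vram
			}
		}
		for _, id := range []string{"1", "2"} {
			fmt.Printf("gpu %s free: %d\n", id, totals[id]-predicted[id])
		}
		// gpu 1 free: 825  (1000-50-125)
		// gpu 2 free: 1875 (2000-50-75)
	}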