diff --git a/gpu/gpu.go b/gpu/gpu.go index 57b04da1..b51dc9e9 100644 --- a/gpu/gpu.go +++ b/gpu/gpu.go @@ -184,10 +184,11 @@ func getCPUMem() (memInfo, error) { func CheckVRAM() (int64, error) { gpuInfo := GetGPUInfo() if gpuInfo.FreeMemory > 0 && (gpuInfo.Library == "cuda" || gpuInfo.Library == "rocm") { - // leave 10% or 384Mi of VRAM free for unaccounted for overhead - overhead := gpuInfo.FreeMemory * uint64(gpuInfo.DeviceCount) / 10 - if overhead < 384*1024*1024 { - overhead = 384 * 1024 * 1024 + // leave 10% or 512MiB of VRAM free per GPU to handle unaccounted for overhead + overhead := gpuInfo.FreeMemory / 10 + gpus := uint64(gpuInfo.DeviceCount) + if overhead < gpus*512*1024*1024 { + overhead = gpus * 512 * 1024 * 1024 } return int64(gpuInfo.FreeMemory - overhead), nil } diff --git a/llm/llm.go b/llm/llm.go index 023077aa..940c0d93 100644 --- a/llm/llm.go +++ b/llm/llm.go @@ -95,20 +95,26 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options) break } - // no offloading required - if requiredTotal <= available { - break - } - - // requiredAlloc is always loaded for the CUDA runner, so don't load it if it won't fit - if requiredAlloc > available { + // alloc buffer and kv cache is allocated as a fixed amount on the main gpu + // TODO: find the largest GPU and only reserve memory there + avgAvailable := available / int64(info.DeviceCount) + if requiredAlloc > avgAvailable { log.Printf("not enough vram available, falling back to CPU only") library = "cpu" opts.NumGPU = 0 break } - available -= requiredAlloc + // we don't know which GPU will be used, so estimate + // the scratch buffer space on all of them + // TODO: allocate less layers to the GPU with the scratch buffer + // and more to the others (based on their available memory) + available -= requiredAlloc * int64(info.DeviceCount) + + // no offloading required + if requiredModel+requiredKv <= available { + break + } // fill remaining vram with layers log.Println("splitting", available, "of available memory bytes into layers")