From b24e8d17b29246cbf520cffc48eac374ec245e48 Mon Sep 17 00:00:00 2001
From: Jeffrey Morgan
Date: Wed, 10 Jan 2024 19:08:51 -0500
Subject: [PATCH] Increase minimum CUDA memory allocation overhead and fix
 minimum overhead for multi-gpu (#1896)

* increase minimum cuda overhead and fix minimum overhead for multi-gpu

* fix multi gpu overhead

* limit overhead to 10% of all gpus

* better wording

* allocate fixed amount before layers

* fixed only includes graph alloc
---
 gpu/gpu.go |  9 +++++----
 llm/llm.go | 22 ++++++++++++++--------
 2 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/gpu/gpu.go b/gpu/gpu.go
index 57b04da1..b51dc9e9 100644
--- a/gpu/gpu.go
+++ b/gpu/gpu.go
@@ -184,10 +184,11 @@ func getCPUMem() (memInfo, error) {
 func CheckVRAM() (int64, error) {
 	gpuInfo := GetGPUInfo()
 	if gpuInfo.FreeMemory > 0 && (gpuInfo.Library == "cuda" || gpuInfo.Library == "rocm") {
-		// leave 10% or 384Mi of VRAM free for unaccounted for overhead
-		overhead := gpuInfo.FreeMemory * uint64(gpuInfo.DeviceCount) / 10
-		if overhead < 384*1024*1024 {
-			overhead = 384 * 1024 * 1024
+		// leave 10% or 512MiB of VRAM free per GPU to handle unaccounted-for overhead
+		overhead := gpuInfo.FreeMemory / 10
+		gpus := uint64(gpuInfo.DeviceCount)
+		if overhead < gpus*512*1024*1024 {
+			overhead = gpus * 512 * 1024 * 1024
 		}
 		return int64(gpuInfo.FreeMemory - overhead), nil
 	}
diff --git a/llm/llm.go b/llm/llm.go
index 023077aa..940c0d93 100644
--- a/llm/llm.go
+++ b/llm/llm.go
@@ -95,20 +95,26 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options)
 			break
 		}
 
-		// no offloading required
-		if requiredTotal <= available {
-			break
-		}
-
-		// requiredAlloc is always loaded for the CUDA runner, so don't load it if it won't fit
-		if requiredAlloc > available {
+		// the alloc buffer and kv cache are allocated as a fixed amount on the main gpu
+		// TODO: find the largest GPU and only reserve memory there
+		avgAvailable := available / int64(info.DeviceCount)
+		if requiredAlloc > avgAvailable {
 			log.Printf("not enough vram available, falling back to CPU only")
 			library = "cpu"
 			opts.NumGPU = 0
 			break
 		}
 
-		available -= requiredAlloc
+		// we don't know which GPU will be used, so estimate
+		// the scratch buffer space on all of them
+		// TODO: allocate fewer layers to the GPU with the scratch buffer
+		// and more to the others (based on their available memory)
+		available -= requiredAlloc * int64(info.DeviceCount)
+
+		// no offloading required
+		if requiredModel+requiredKv <= available {
+			break
+		}
 
 		// fill remaining vram with layers
 		log.Println("splitting", available, "of available memory bytes into layers")
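
Note on the CheckVRAM change: the reservation is now the larger of 10% of
free VRAM or a 512MiB-per-GPU floor, instead of scaling the 10% estimate by
the device count. Below is a minimal standalone Go sketch of that
arithmetic; freeMemory and deviceCount are plain parameters standing in for
the real gpuInfo fields, and the values in main are made-up examples, not
output from the patched code.

package main

import "fmt"

// usableVRAM mirrors the reservation logic added to CheckVRAM: hold back
// the larger of 10% of free VRAM or 512MiB per GPU, and report the rest
// as usable for model layers. Sizes are in bytes.
func usableVRAM(freeMemory, deviceCount uint64) int64 {
	overhead := freeMemory / 10
	if floor := deviceCount * 512 * 1024 * 1024; overhead < floor {
		overhead = floor
	}
	return int64(freeMemory - overhead)
}

func main() {
	// Two GPUs with 8GiB free in total: the per-GPU floor (1GiB) exceeds
	// the 10% estimate (~819MiB), so 7GiB is reported as usable.
	fmt.Println(usableVRAM(8*1024*1024*1024, 2))
}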
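
Note on the llm.go change: requiredAlloc (the graph/scratch allocation) is
now reserved on every device before any layers are placed, since we don't
know which GPU will end up hosting it, and the no-offload check moves after
that reservation, comparing only requiredModel+requiredKv. A simplified
standalone sketch of the reordered admission check follows, using the
patch's variable names as plain parameters; it is an illustration of the
flow under those assumptions, not the real function.

package main

import "fmt"

// fitsEntirely sketches the reordered check in llm.New: first make sure the
// scratch allocation fits on an average device, then reserve it on every
// device, then see whether the weights plus kv cache fit in what remains.
// All sizes are in bytes.
func fitsEntirely(available, requiredAlloc, requiredModel, requiredKv, deviceCount int64) (bool, int64) {
	avgAvailable := available / deviceCount
	if requiredAlloc > avgAvailable {
		return false, 0 // the real code falls back to CPU only here
	}
	// reserve the scratch buffer on all devices, since any of them may host it
	available -= requiredAlloc * deviceCount
	return requiredModel+requiredKv <= available, available
}

func main() {
	// 24GiB free across two GPUs, 1GiB scratch, 13GiB weights, 2GiB kv cache:
	// 22GiB remain after the per-device reservation, so everything fits
	// without splitting layers back to the CPU.
	fmt.Println(fitsEntirely(24<<30, 1<<30, 13<<30, 2<<30, 2))
}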