From b24e8d17b29246cbf520cffc48eac374ec245e48 Mon Sep 17 00:00:00 2001
From: Jeffrey Morgan
Date: Wed, 10 Jan 2024 19:08:51 -0500
Subject: [PATCH] Increase minimum CUDA memory allocation overhead and fix
 minimum overhead for multi-gpu (#1896)

* increase minimum cuda overhead and fix minimum overhead for multi-gpu

* fix multi gpu overhead

* limit overhead to 10% of all gpus

* better wording

* allocate fixed amount before layers

* fixed only includes graph alloc
---
 gpu/gpu.go |  9 +++++----
 llm/llm.go | 22 ++++++++++++++--------
 2 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/gpu/gpu.go b/gpu/gpu.go
index 57b04da1..b51dc9e9 100644
--- a/gpu/gpu.go
+++ b/gpu/gpu.go
@@ -184,10 +184,11 @@ func getCPUMem() (memInfo, error) {
 func CheckVRAM() (int64, error) {
 	gpuInfo := GetGPUInfo()
 	if gpuInfo.FreeMemory > 0 && (gpuInfo.Library == "cuda" || gpuInfo.Library == "rocm") {
-		// leave 10% or 384Mi of VRAM free for unaccounted for overhead
-		overhead := gpuInfo.FreeMemory * uint64(gpuInfo.DeviceCount) / 10
-		if overhead < 384*1024*1024 {
-			overhead = 384 * 1024 * 1024
+		// leave 10% or 512MiB of VRAM free per GPU to handle unaccounted-for overhead
+		overhead := gpuInfo.FreeMemory / 10
+		gpus := uint64(gpuInfo.DeviceCount)
+		if overhead < gpus*512*1024*1024 {
+			overhead = gpus * 512 * 1024 * 1024
 		}
 		return int64(gpuInfo.FreeMemory - overhead), nil
 	}
diff --git a/llm/llm.go b/llm/llm.go
index 023077aa..940c0d93 100644
--- a/llm/llm.go
+++ b/llm/llm.go
@@ -95,20 +95,26 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options)
 			break
 		}
 
-		// no offloading required
-		if requiredTotal <= available {
-			break
-		}
-
-		// requiredAlloc is always loaded for the CUDA runner, so don't load it if it won't fit
-		if requiredAlloc > available {
+		// the alloc buffer and kv cache are allocated as a fixed amount on the main gpu
+		// TODO: find the largest GPU and only reserve memory there
+		avgAvailable := available / int64(info.DeviceCount)
+		if requiredAlloc > avgAvailable {
 			log.Printf("not enough vram available, falling back to CPU only")
 			library = "cpu"
 			opts.NumGPU = 0
 			break
 		}
 
-		available -= requiredAlloc
+		// we don't know which GPU will be used, so estimate
+		// the scratch buffer space on all of them
+		// TODO: allocate fewer layers to the GPU with the scratch buffer
+		// and more to the others (based on their available memory)
+		available -= requiredAlloc * int64(info.DeviceCount)
+
+		// no offloading required
+		if requiredModel+requiredKv <= available {
+			break
+		}
 
 		// fill remaining vram with layers
 		log.Println("splitting", available, "of available memory bytes into layers")
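
Note on the CheckVRAM change: the reservation is now the larger of 10% of
free VRAM or a 512MiB-per-GPU floor, instead of scaling the 10% estimate by
the device count. Below is a minimal standalone Go sketch of that
arithmetic; freeMemory and deviceCount are plain parameters standing in for
the real gpuInfo fields, and the values in main are made-up examples, not
output from the patched code.

package main

import "fmt"

// usableVRAM mirrors the reservation logic added to CheckVRAM: hold back
// the larger of 10% of free VRAM or 512MiB per GPU, and report the rest
// as usable for model layers. Sizes are in bytes.
func usableVRAM(freeMemory, deviceCount uint64) int64 {
	overhead := freeMemory / 10
	if floor := deviceCount * 512 * 1024 * 1024; overhead < floor {
		overhead = floor
	}
	return int64(freeMemory - overhead)
}

func main() {
	// Two GPUs with 8GiB free in total: the per-GPU floor (1GiB) exceeds
	// the 10% estimate (~819MiB), so 7GiB is reported as usable.
	fmt.Println(usableVRAM(8*1024*1024*1024, 2))
}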
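
Note on the llm.go change: requiredAlloc (the graph/scratch allocation) is
now reserved on every device before any layers are placed, since we don't
know which GPU will end up hosting it, and the no-offload check moves after
that reservation, comparing only requiredModel+requiredKv. A simplified
standalone sketch of the reordered admission check follows, using the
patch's variable names as plain parameters; it is an illustration of the
flow under those assumptions, not the real function.

package main

import "fmt"

// fitsEntirely sketches the reordered check in llm.New: first make sure the
// scratch allocation fits on an average device, then reserve it on every
// device, then see whether the weights plus kv cache fit in what remains.
// All sizes are in bytes.
func fitsEntirely(available, requiredAlloc, requiredModel, requiredKv, deviceCount int64) (bool, int64) {
	avgAvailable := available / deviceCount
	if requiredAlloc > avgAvailable {
		return false, 0 // the real code falls back to CPU only here
	}
	// reserve the scratch buffer on all devices, since any of them may host it
	available -= requiredAlloc * deviceCount
	return requiredModel+requiredKv <= available, available
}

func main() {
	// 24GiB free across two GPUs, 1GiB scratch, 13GiB weights, 2GiB kv cache:
	// 22GiB remain after the per-device reservation, so everything fits
	// without splitting layers back to the CPU.
	fmt.Println(fitsEntirely(24<<30, 1<<30, 13<<30, 2<<30, 2))
}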