forked from third-party-mirrors/ollama
gpu: read memory info from all cuda devices (#1802)
* gpu: read memory info from all cuda devices
* add `LOOKUP_SIZE` constant
* better constant name
* address comments
This commit is contained in:
parent
3367b5f3df
commit
df32537312
@ -20,6 +20,8 @@ const char *cuda_lib_paths[] = {
|
|||||||
};
|
};
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#define CUDA_LOOKUP_SIZE 5
|
||||||
|
|
||||||
void cuda_init(cuda_init_resp_t *resp) {
|
void cuda_init(cuda_init_resp_t *resp) {
|
||||||
nvmlReturn_t ret;
|
nvmlReturn_t ret;
|
||||||
resp->err = NULL;
|
resp->err = NULL;
|
||||||
@ -30,11 +32,12 @@ void cuda_init(cuda_init_resp_t *resp) {
|
|||||||
struct lookup {
|
struct lookup {
|
||||||
char *s;
|
char *s;
|
||||||
void **p;
|
void **p;
|
||||||
} l[4] = {
|
} l[CUDA_LOOKUP_SIZE] = {
|
||||||
{"nvmlInit_v2", (void *)&resp->ch.initFn},
|
{"nvmlInit_v2", (void *)&resp->ch.initFn},
|
||||||
{"nvmlShutdown", (void *)&resp->ch.shutdownFn},
|
{"nvmlShutdown", (void *)&resp->ch.shutdownFn},
|
||||||
{"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.getHandle},
|
{"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.getHandle},
|
||||||
{"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.getMemInfo},
|
{"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.getMemInfo},
|
||||||
|
{"nvmlDeviceGetCount_v2", (void *)&resp->ch.getCount},
|
||||||
};
|
};
|
||||||
|
|
||||||
for (i = 0; cuda_lib_paths[i] != NULL && resp->ch.handle == NULL; i++) {
|
for (i = 0; cuda_lib_paths[i] != NULL && resp->ch.handle == NULL; i++) {
|
||||||
@ -52,7 +55,7 @@ void cuda_init(cuda_init_resp_t *resp) {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (i = 0; i < 4; i++) { // TODO - fix this to use a null terminated list
|
for (i = 0; i < CUDA_LOOKUP_SIZE; i++) { // TODO - fix this to use a null terminated list
|
||||||
*l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
|
*l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
|
||||||
if (!l[i].p) {
|
if (!l[i].p) {
|
||||||
UNLOAD_LIBRARY(resp->ch.handle);
|
UNLOAD_LIBRARY(resp->ch.handle);
|
||||||
@ -89,22 +92,34 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO - handle multiple GPUs
|
unsigned int devices;
|
||||||
ret = (*h.getHandle)(0, &device);
|
ret = (*h.getCount)(&devices);
|
||||||
if (ret != NVML_SUCCESS) {
|
if (ret != NVML_SUCCESS) {
|
||||||
snprintf(buf, buflen, "unable to get device handle: %d", ret);
|
snprintf(buf, buflen, "unable to get device count: %d", ret);
|
||||||
resp->err = strdup(buf);
|
resp->err = strdup(buf);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
ret = (*h.getMemInfo)(device, &memInfo);
|
resp->total = 0;
|
||||||
if (ret != NVML_SUCCESS) {
|
resp->free = 0;
|
||||||
snprintf(buf, buflen, "device memory info lookup failure: %d", ret);
|
|
||||||
resp->err = strdup(buf);
|
for (i = 0; i < devices; i++) {
|
||||||
return;
|
ret = (*h.getHandle)(i, &device);
|
||||||
|
if (ret != NVML_SUCCESS) {
|
||||||
|
snprintf(buf, buflen, "unable to get device handle %d: %d", i, ret);
|
||||||
|
resp->err = strdup(buf);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
ret = (*h.getMemInfo)(device, &memInfo);
|
||||||
|
if (ret != NVML_SUCCESS) {
|
||||||
|
snprintf(buf, buflen, "device memory info lookup failure %d: %d", i, ret);
|
||||||
|
resp->err = strdup(buf);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
resp->total += memInfo.total;
|
||||||
|
resp->free += memInfo.free;
|
||||||
}
|
}
|
||||||
resp->total = memInfo.total;
|
|
||||||
resp->free = memInfo.free;
|
|
||||||
return;
|
|
||||||
}
|
}
|
||||||
#endif // __APPLE__
|
#endif // __APPLE__
|
@ -21,6 +21,7 @@ typedef struct cuda_handle {
|
|||||||
nvmlReturn_t (*shutdownFn)(void);
|
nvmlReturn_t (*shutdownFn)(void);
|
||||||
nvmlReturn_t (*getHandle)(unsigned int, nvmlDevice_t *);
|
nvmlReturn_t (*getHandle)(unsigned int, nvmlDevice_t *);
|
||||||
nvmlReturn_t (*getMemInfo)(nvmlDevice_t, nvmlMemory_t *);
|
nvmlReturn_t (*getMemInfo)(nvmlDevice_t, nvmlMemory_t *);
|
||||||
|
nvmlReturn_t (*getCount)(unsigned int *);
|
||||||
} cuda_handle_t;
|
} cuda_handle_t;
|
||||||
|
|
||||||
typedef struct cuda_init_resp {
|
typedef struct cuda_init_resp {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user