Compare commits

1 commit from jmorganca/… compared against main — commit bd933c24bc
Deleted file — dyn_ext_server.c (@@ -1,145 +0,0 @@):

```c
#include "dyn_ext_server.h"

#include <stdio.h>
#include <string.h>

#ifdef __linux__
#include <dlfcn.h>
#define LOAD_LIBRARY(lib, flags) dlopen(lib, flags)
#define LOAD_SYMBOL(handle, sym) dlsym(handle, sym)
#define LOAD_ERR() strdup(dlerror())
#define UNLOAD_LIBRARY(handle) dlclose(handle)
#elif _WIN32
#include <windows.h>
#define LOAD_LIBRARY(lib, flags) LoadLibrary(lib)
#define LOAD_SYMBOL(handle, sym) GetProcAddress(handle, sym)
#define UNLOAD_LIBRARY(handle) FreeLibrary(handle)
inline char *LOAD_ERR() {
  LPSTR messageBuffer = NULL;
  size_t size = FormatMessageA(
      FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM |
          FORMAT_MESSAGE_IGNORE_INSERTS,
      NULL, GetLastError(), MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
      (LPSTR)&messageBuffer, 0, NULL);
  char *resp = strdup(messageBuffer);
  LocalFree(messageBuffer);
  return resp;
}
#else
#include <dlfcn.h>
#define LOAD_LIBRARY(lib, flags) dlopen(lib, flags)
#define LOAD_SYMBOL(handle, sym) dlsym(handle, sym)
#define LOAD_ERR() strdup(dlerror())
#define UNLOAD_LIBRARY(handle) dlclose(handle)
#endif

void dyn_init(const char *libPath, struct dynamic_llama_server *s,
              ext_server_resp_t *err) {
  int i = 0;
  struct lookup {
    char *s;
    void **p;
  } l[] = {
      {"llama_server_init", (void *)&s->llama_server_init},
      {"llama_server_start", (void *)&s->llama_server_start},
      {"llama_server_stop", (void *)&s->llama_server_stop},
      {"llama_server_completion", (void *)&s->llama_server_completion},
      {"llama_server_completion_next_result",
       (void *)&s->llama_server_completion_next_result},
      {"llama_server_completion_cancel",
       (void *)&s->llama_server_completion_cancel},
      {"llama_server_release_task_result",
       (void *)&s->llama_server_release_task_result},
      {"llama_server_tokenize", (void *)&s->llama_server_tokenize},
      {"llama_server_detokenize", (void *)&s->llama_server_detokenize},
      {"llama_server_embedding", (void *)&s->llama_server_embedding},
      {"llama_server_release_json_resp",
       (void *)&s->llama_server_release_json_resp},
      {"", NULL},
  };

  printf("loading library %s\n", libPath);
  s->handle = LOAD_LIBRARY(libPath, RTLD_LOCAL|RTLD_NOW);
  if (!s->handle) {
    err->id = -1;
    char *msg = LOAD_ERR();
    snprintf(err->msg, err->msg_len,
             "Unable to load dynamic server library: %s", msg);
    free(msg);
    return;
  }

  for (i = 0; l[i].p != NULL; i++) {
    *l[i].p = LOAD_SYMBOL(s->handle, l[i].s);
    if (!l[i].p) {
      UNLOAD_LIBRARY(s->handle);
      err->id = -1;
      char *msg = LOAD_ERR();
      snprintf(err->msg, err->msg_len, "symbol lookup for %s failed: %s",
               l[i].s, msg);
      free(msg);
      return;
    }
  }
}

inline void dyn_llama_server_init(struct dynamic_llama_server s,
                                  ext_server_params_t *sparams,
                                  ext_server_resp_t *err) {
  s.llama_server_init(sparams, err);
}

inline void dyn_llama_server_start(struct dynamic_llama_server s) {
  s.llama_server_start();
}

inline void dyn_llama_server_stop(struct dynamic_llama_server s) {
  s.llama_server_stop();
}

inline void dyn_llama_server_completion(struct dynamic_llama_server s,
                                        const char *json_req,
                                        ext_server_resp_t *resp) {
  s.llama_server_completion(json_req, resp);
}

inline void dyn_llama_server_completion_next_result(
    struct dynamic_llama_server s, const int task_id,
    ext_server_task_result_t *result) {
  s.llama_server_completion_next_result(task_id, result);
}

inline void dyn_llama_server_completion_cancel(
    struct dynamic_llama_server s, const int task_id, ext_server_resp_t *err) {
  s.llama_server_completion_cancel(task_id, err);
}

inline void dyn_llama_server_release_task_result(
    struct dynamic_llama_server s, ext_server_task_result_t *result) {
  s.llama_server_release_task_result(result);
}

inline void dyn_llama_server_tokenize(struct dynamic_llama_server s,
                                      const char *json_req,
                                      char **json_resp,
                                      ext_server_resp_t *err) {
  s.llama_server_tokenize(json_req, json_resp, err);
}

inline void dyn_llama_server_detokenize(struct dynamic_llama_server s,
                                        const char *json_req,
                                        char **json_resp,
                                        ext_server_resp_t *err) {
  s.llama_server_detokenize(json_req, json_resp, err);
}

inline void dyn_llama_server_embedding(struct dynamic_llama_server s,
                                       const char *json_req,
                                       char **json_resp,
                                       ext_server_resp_t *err) {
  s.llama_server_embedding(json_req, json_resp, err);
}

inline void dyn_llama_server_release_json_resp(
    struct dynamic_llama_server s, char **json_resp) {
  s.llama_server_release_json_resp(json_resp);
}
```
Deleted file — dyn_ext_server.h (@@ -1,74 +0,0 @@):

```c
#include <stdlib.h>

#include "ext_server.h"

#ifdef __cplusplus
extern "C" {
#endif
struct dynamic_llama_server {
  void *handle;
  void (*llama_server_init)(ext_server_params_t *sparams,
                            ext_server_resp_t *err);
  void (*llama_server_start)();
  void (*llama_server_stop)();
  void (*llama_server_completion)(const char *json_req,
                                  ext_server_resp_t *resp);
  void (*llama_server_completion_next_result)(const int task_id,
                                              ext_server_task_result_t *result);
  void (*llama_server_completion_cancel)(const int task_id,
                                         ext_server_resp_t *err);
  void (*llama_server_release_task_result)(ext_server_task_result_t *result);
  void (*llama_server_tokenize)(const char *json_req, char **json_resp,
                                ext_server_resp_t *err);
  void (*llama_server_detokenize)(const char *json_req, char **json_resp,
                                  ext_server_resp_t *err);
  void (*llama_server_embedding)(const char *json_req, char **json_resp,
                                 ext_server_resp_t *err);
  void (*llama_server_release_json_resp)(char **json_resp);
};

void dyn_init(const char *libPath, struct dynamic_llama_server *s,
              ext_server_resp_t *err);

// No good way to call C function pointers from Go so inline the indirection
void dyn_llama_server_init(struct dynamic_llama_server s,
                           ext_server_params_t *sparams,
                           ext_server_resp_t *err);

void dyn_llama_server_start(struct dynamic_llama_server s);

void dyn_llama_server_stop(struct dynamic_llama_server s);

void dyn_llama_server_completion(struct dynamic_llama_server s,
                                 const char *json_req,
                                 ext_server_resp_t *resp);

void dyn_llama_server_completion_next_result(
    struct dynamic_llama_server s, const int task_id,
    ext_server_task_result_t *result);

void dyn_llama_server_completion_cancel(struct dynamic_llama_server s,
                                        const int task_id,
                                        ext_server_resp_t *err);

void dyn_llama_server_release_task_result(
    struct dynamic_llama_server s, ext_server_task_result_t *result);

void dyn_llama_server_tokenize(struct dynamic_llama_server s,
                               const char *json_req, char **json_resp,
                               ext_server_resp_t *err);

void dyn_llama_server_detokenize(struct dynamic_llama_server s,
                                 const char *json_req,
                                 char **json_resp,
                                 ext_server_resp_t *err);

void dyn_llama_server_embedding(struct dynamic_llama_server s,
                                const char *json_req, char **json_resp,
                                ext_server_resp_t *err);
void dyn_llama_server_release_json_resp(struct dynamic_llama_server s,
                                        char **json_resp);

#ifdef __cplusplus
}
#endif
```
Deleted file — ext_server/CMakeLists.txt (@@ -1,25 +0,0 @@):

```cmake
# Ollama specific CMakefile to include in llama.cpp/examples/server

set(TARGET ext_server)
option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
if (WIN32)
    add_library(${TARGET} SHARED ../../../ext_server/ext_server.cpp ../../llama.cpp)
else()
    add_library(${TARGET} STATIC ../../../ext_server/ext_server.cpp ../../llama.cpp)
endif()
target_include_directories(${TARGET} PRIVATE ../../common)
target_include_directories(${TARGET} PRIVATE ../..)
target_include_directories(${TARGET} PRIVATE ../../..)
target_compile_features(${TARGET} PRIVATE cxx_std_11)
target_compile_definitions(${TARGET} PUBLIC LLAMA_SERVER_LIBRARY=1)
target_link_libraries(${TARGET} PRIVATE ggml llava common )
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(${TARGET} PRIVATE SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>)
install(TARGETS ext_server LIBRARY)

if (CUDAToolkit_FOUND)
    target_include_directories(${TARGET} PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
    if (WIN32)
        target_link_libraries(${TARGET} PRIVATE nvml)
    endif()
endif()
```
Deleted file — ext_server README (@@ -1,18 +0,0 @@):

# Extern C Server

This directory contains a thin facade we layer on top of the Llama.cpp server to
expose `extern C` interfaces to access the functionality through direct API
calls in-process. The llama.cpp code uses compile time macros to configure GPU
type along with other settings. During the `go generate ./...` execution, the
build will generate one or more copies of the llama.cpp `extern C` server based
on what GPU libraries are detected to support multiple GPU types as well as CPU
only support. The Ollama go build then embeds these different servers to support
different GPUs and settings at runtime.

If you are making changes to the code in this directory, make sure to disable
caching during your go build to ensure you pick up your changes. A typical
iteration cycle from the top of the source tree looks like:

```
go generate ./... && go build -a .
```
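For context, here is a minimal sketch of how the Go side might load one of these generated libraries through the `dyn_*` facade above via cgo. The helper name `loadDynServer`, the 512-byte error buffer, and the build flags are illustrative assumptions, not part of this change set; it also assumes the include path for `dyn_ext_server.h` and the linking of `dyn_ext_server.c` are configured elsewhere.

```go
package llm

/*
#cgo CFLAGS: -DLLAMA_SERVER_LIBRARY=1
#include <stdlib.h>
#include "dyn_ext_server.h"
*/
import "C"

import (
	"fmt"
	"unsafe"
)

// loadDynServer is a hypothetical helper: it dlopen()s a generated
// libext_server payload and resolves its symbols using dyn_init above.
func loadDynServer(libPath string) (C.struct_dynamic_llama_server, error) {
	var srv C.struct_dynamic_llama_server

	// ext_server_resp_t requires the caller to allocate msg and set msg_len.
	var resp C.ext_server_resp_t
	resp.id = 0
	resp.msg_len = 512
	resp.msg = (*C.char)(C.malloc(512))
	defer C.free(unsafe.Pointer(resp.msg))

	lib := C.CString(libPath)
	defer C.free(unsafe.Pointer(lib))

	C.dyn_init(lib, &srv, &resp)
	if resp.id < 0 {
		return srv, fmt.Errorf("loading %s: %s", libPath, C.GoString(resp.msg))
	}
	return srv, nil
}
```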
Deleted file — ext_server/ext_server.cpp (@@ -1,381 +0,0 @@):

```cpp
#include "ext_server.h"
#include <atomic>

// Necessary evil since the server types are not defined in a header
#include "server.cpp"

// Low level API access to verify GPU access
#if defined(GGML_USE_CUBLAS)
#if defined(GGML_USE_HIPBLAS)
#include <hip/hip_runtime.h>
#include <hipblas/hipblas.h>
#include <hip/hip_fp16.h>
#ifdef __HIP_PLATFORM_AMD__
// for rocblas_initialize()
#include "rocblas/rocblas.h"
#endif  // __HIP_PLATFORM_AMD__
#define cudaGetDevice hipGetDevice
#define cudaError_t hipError_t
#define cudaSuccess hipSuccess
#define cudaGetErrorString hipGetErrorString
#else
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cuda_fp16.h>
#endif  // defined(GGML_USE_HIPBLAS)
#endif  // GGML_USE_CUBLAS

// Expose the llama server as a callable extern "C" API
llama_server_context *llama = NULL;
std::thread ext_server_thread;
bool shutting_down = false;
std::atomic_int recv_counter;

// RAII wrapper for tracking in-flight recv calls
class atomicRecv {
 public:
  atomicRecv(std::atomic<int> &atomic) : atomic(atomic) {
    ++this->atomic;
  }
  ~atomicRecv() {
    --this->atomic;
  }
 private:
  std::atomic<int> &atomic;
};

void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) {
  recv_counter = 0;
  assert(err != NULL && sparams != NULL);
  log_set_target(stderr);
  if (!sparams->verbose_logging) {
    server_verbose = true;
    log_disable();
  }

  LOG_TEE("system info: %s\n", llama_print_system_info());
  err->id = 0;
  err->msg[0] = '\0';
  try {
    llama = new llama_server_context;
    gpt_params params;
    params.n_ctx = sparams->n_ctx;
    params.n_batch = sparams->n_batch;
    if (sparams->n_threads > 0) {
      params.n_threads = sparams->n_threads;
    }
    params.n_parallel = sparams->n_parallel;
    params.rope_freq_base = sparams->rope_freq_base;
    params.rope_freq_scale = sparams->rope_freq_scale;

    if (sparams->memory_f16) {
      params.cache_type_k = "f16";
      params.cache_type_v = "f16";
    } else {
      params.cache_type_k = "f32";
      params.cache_type_v = "f32";
    }

    params.n_gpu_layers = sparams->n_gpu_layers;
    params.main_gpu = sparams->main_gpu;
    params.use_mlock = sparams->use_mlock;
    params.use_mmap = sparams->use_mmap;
    params.numa = (ggml_numa_strategy)sparams->numa;
    params.embedding = sparams->embedding;
    if (sparams->model != NULL) {
      params.model = sparams->model;
    }

    if (sparams->lora_adapters != NULL) {
      for (ext_server_lora_adapter *la = sparams->lora_adapters; la != NULL;
           la = la->next) {
        params.lora_adapter.push_back(std::make_tuple(la->adapter, la->scale));
      }

      params.use_mmap = false;
    }

    if (sparams->mmproj != NULL) {
      params.mmproj = std::string(sparams->mmproj);
    }

#if defined(GGML_USE_CUBLAS)
    // Before attempting to init the backend which will assert on error, verify the CUDA/ROCM GPU is accessible
    LOG_TEE("Performing pre-initialization of GPU\n");
    int id;
    cudaError_t cudaErr = cudaGetDevice(&id);
    if (cudaErr != cudaSuccess) {
      err->id = -1;
      snprintf(err->msg, err->msg_len, "Unable to init GPU: %s", cudaGetErrorString(cudaErr));
      return;
    }
#endif

    llama_backend_init();
    llama_numa_init(params.numa);

    // load the model
    if (!llama->load_model(params)) {
      // TODO - consider modifying the logging logic or patching load_model so
      // we can capture more detailed error messages and pass them back to the
      // caller for better UX
      err->id = -1;
      snprintf(err->msg, err->msg_len, "error loading model %s",
               params.model.c_str());
      return;
    }

    llama->initialize();
  } catch (std::exception &e) {
    err->id = -1;
    snprintf(err->msg, err->msg_len, "exception %s", e.what());
  } catch (...) {
    err->id = -1;
    snprintf(err->msg, err->msg_len,
             "Unknown exception initializing llama server");
  }
}

void llama_server_start() {
  assert(llama != NULL);
  // TODO mutex to protect thread creation
  ext_server_thread = std::thread([&]() {
    try {
      LOG_TEE("llama server main loop starting\n");
      ggml_time_init();
      llama->queue_tasks.on_new_task(std::bind(
          &llama_server_context::process_single_task, llama, std::placeholders::_1));
      llama->queue_tasks.on_finish_multitask(std::bind(
          &llama_server_context::on_finish_multitask, llama, std::placeholders::_1));
      llama->queue_tasks.on_run_slots(std::bind(
          &llama_server_context::update_slots, llama));
      llama->queue_results.on_multitask_update(std::bind(
          &llama_server_queue::update_multitask,
          &llama->queue_tasks,
          std::placeholders::_1,
          std::placeholders::_2,
          std::placeholders::_3
        ));
      llama->queue_tasks.start_loop();
    } catch (std::exception &e) {
      LOG_TEE("caught exception in llama server main loop: %s\n", e.what());
    } catch (...) {
      LOG_TEE("caught unknown exception in llama server main loop\n");
    }
    LOG_TEE("\nllama server shutting down\n");
    llama_backend_free();
  });
}

void llama_server_stop() {
  assert(llama != NULL);
  // Shutdown any in-flight requests and block incoming requests.
  LOG_TEE("\ninitiating shutdown - draining remaining tasks...\n");
  shutting_down = true;

  while (recv_counter.load() > 0) {
    std::this_thread::sleep_for(std::chrono::milliseconds(50));
  }

  // This may take a while for any pending tasks to drain
  // TODO - consider a timeout to cancel tasks if it's taking too long
  llama->queue_tasks.terminate();
  ext_server_thread.join();
  delete llama;
  llama = NULL;
  LOG_TEE("llama server shutdown complete\n");
  shutting_down = false;
}

void llama_server_completion(const char *json_req, ext_server_resp_t *resp) {
  assert(llama != NULL && json_req != NULL && resp != NULL);
  resp->id = -1;
  resp->msg[0] = '\0';
  try {
    if (shutting_down) {
      throw std::runtime_error("server shutting down");
    }
    json data = json::parse(json_req);
    resp->id = llama->queue_tasks.get_new_id();
    llama->queue_results.add_waiting_task_id(resp->id);
    llama->request_completion(resp->id, data, false, false, -1);
  } catch (std::exception &e) {
    snprintf(resp->msg, resp->msg_len, "exception %s", e.what());
  } catch (...) {
    snprintf(resp->msg, resp->msg_len, "Unknown exception during completion");
  }
}

void llama_server_completion_next_result(const int task_id,
                                         ext_server_task_result_t *resp) {
  assert(llama != NULL && resp != NULL);
  resp->id = -1;
  resp->stop = false;
  resp->error = false;
  resp->json_resp = NULL;
  std::string result_json;
  try {
    atomicRecv ar(recv_counter);
    task_result result = llama->queue_results.recv(task_id);
    result_json =
        result.result_json.dump(-1, ' ', false, json::error_handler_t::replace);
    resp->id = result.id;
    resp->stop = result.stop;
    resp->error = result.error;
    if (result.error) {
      LOG_TEE("next result cancel on error\n");
      llama->request_cancel(task_id);
      LOG_TEE("next result removing waiting task ID: %d\n", task_id);
      llama->queue_results.remove_waiting_task_id(task_id);
    } else if (result.stop) {
      LOG_TEE("next result cancel on stop\n");
      llama->request_cancel(task_id);
      LOG_TEE("next result removing waiting task ID: %d\n", task_id);
      llama->queue_results.remove_waiting_task_id(task_id);
    } else if (shutting_down) {
      LOG_TEE("aborting completion due to shutdown %d\n", task_id);
      llama->request_cancel(task_id);
      llama->queue_results.remove_waiting_task_id(task_id);
      resp->stop = true;
    }
  } catch (std::exception &e) {
    resp->error = true;
    resp->id = -1;
    result_json = "{\"error\":\"exception " + std::string(e.what()) + "\"}";
    LOG_TEE("llama server completion exception %s\n", e.what());
  } catch (...) {
    resp->error = true;
    resp->id = -1;
    result_json = "{\"error\":\"Unknown exception during completion\"}";
    LOG_TEE("llama server completion unknown exception\n");
  }
  const std::string::size_type size = result_json.size() + 1;
  resp->json_resp = new char[size];
  snprintf(resp->json_resp, size, "%s", result_json.c_str());
}

void llama_server_release_task_result(ext_server_task_result_t *result) {
  if (result == NULL || result->json_resp == NULL) {
    return;
  }
  delete[] result->json_resp;
}

void llama_server_completion_cancel(const int task_id, ext_server_resp_t *err) {
  assert(llama != NULL && err != NULL);
  err->id = 0;
  err->msg[0] = '\0';
  try {
    llama->request_cancel(task_id);
    llama->queue_results.remove_waiting_task_id(task_id);
  } catch (std::exception &e) {
    err->id = -1;
    snprintf(err->msg, err->msg_len, "exception %s", e.what());
  } catch (...) {
    err->id = -1;
    snprintf(err->msg, err->msg_len,
             "Unknown exception completion cancel in llama server");
  }
}

void llama_server_tokenize(const char *json_req, char **json_resp,
                           ext_server_resp_t *err) {
  assert(llama != NULL && json_req != NULL && json_resp != NULL && err != NULL);
  *json_resp = NULL;
  err->id = 0;
  err->msg[0] = '\0';
  try {
    if (shutting_down) {
      throw std::runtime_error("server shutting down");
    }
    const json body = json::parse(json_req);
    std::vector<llama_token> tokens;
    if (body.count("content") != 0) {
      tokens = llama->tokenize(body["content"], false);
    }
    const json data = format_tokenizer_response(tokens);
    std::string result_json = data.dump();
    const std::string::size_type size = result_json.size() + 1;
    *json_resp = new char[size];
    snprintf(*json_resp, size, "%s", result_json.c_str());
  } catch (std::exception &e) {
    err->id = -1;
    snprintf(err->msg, err->msg_len, "exception %s", e.what());
  } catch (...) {
    err->id = -1;
    snprintf(err->msg, err->msg_len, "Unknown exception during tokenize");
  }
}

void llama_server_release_json_resp(char **json_resp) {
  if (json_resp == NULL || *json_resp == NULL) {
    return;
  }
  delete[] *json_resp;
}

void llama_server_detokenize(const char *json_req, char **json_resp,
                             ext_server_resp_t *err) {
  assert(llama != NULL && json_req != NULL && json_resp != NULL && err != NULL);
  *json_resp = NULL;
  err->id = 0;
  err->msg[0] = '\0';
  try {
    if (shutting_down) {
      throw std::runtime_error("server shutting down");
    }
    const json body = json::parse(json_req);
    std::string content;
    if (body.count("tokens") != 0) {
      const std::vector<llama_token> tokens = body["tokens"];
      content = tokens_to_str(llama->ctx, tokens.cbegin(), tokens.cend());
    }
    const json data = format_detokenized_response(content);
    std::string result_json = data.dump();
    const std::string::size_type size = result_json.size() + 1;
    *json_resp = new char[size];
    snprintf(*json_resp, size, "%s", result_json.c_str());
  } catch (std::exception &e) {
    err->id = -1;
    snprintf(err->msg, err->msg_len, "exception %s", e.what());
  } catch (...) {
    err->id = -1;
    snprintf(err->msg, err->msg_len, "Unknown exception during detokenize");
  }
}

void llama_server_embedding(const char *json_req, char **json_resp,
                            ext_server_resp_t *err) {
  assert(llama != NULL && json_req != NULL && json_resp != NULL && err != NULL);
  *json_resp = NULL;
  err->id = 0;
  err->msg[0] = '\0';
  try {
    if (shutting_down) {
      throw std::runtime_error("server shutting down");
    }
    const json body = json::parse(json_req);
    json prompt;
    if (body.count("content") != 0) {
      prompt = body["content"];
    } else {
      prompt = "";
    }
    const int task_id = llama->queue_tasks.get_new_id();
    llama->queue_results.add_waiting_task_id(task_id);
    llama->request_completion(task_id, {{"prompt", prompt}, {"n_predict", 0}}, false, true, -1);
    atomicRecv ar(recv_counter);
    task_result result = llama->queue_results.recv(task_id);
    std::string result_json = result.result_json.dump();
    const std::string::size_type size = result_json.size() + 1;
    *json_resp = new char[size];
    snprintf(*json_resp, size, "%s", result_json.c_str());
    llama->queue_results.remove_waiting_task_id(task_id);
  } catch (std::exception &e) {
    err->id = -1;
    snprintf(err->msg, err->msg_len, "exception %s", e.what());
  } catch (...) {
    err->id = -1;
    snprintf(err->msg, err->msg_len, "Unknown exception during embedding");
  }
}
```
Deleted file — ext_server/ext_server.h (@@ -1,95 +0,0 @@):

```c
#if defined(LLAMA_SERVER_LIBRARY)
#ifndef LLAMA_SERVER_H
#define LLAMA_SERVER_H
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

int __main(int argc, char **argv);

// This exposes extern C entrypoints into the llama_server
// To enable the server compile with LLAMA_SERVER_LIBRARY

#ifdef __cplusplus
extern "C" {
#endif
typedef struct ext_server_resp {
  int id;          // < 0 on error
  size_t msg_len;  // caller must allocate msg and set msg_len
  char *msg;
} ext_server_resp_t;

// Allocated and freed by caller
typedef struct ext_server_lora_adapter {
  char *adapter;
  float scale;
  struct ext_server_lora_adapter *next;
} ext_server_lora_adapter_t;

// Allocated and freed by caller
typedef struct ext_server_params {
  char *model;
  uint32_t n_ctx;         // token context window, 0 = from model
  uint32_t n_batch;       // prompt processing maximum batch size
  uint32_t n_threads;     // number of threads to use for generation
  int32_t n_parallel;     // number of parallel sequences to decode
  float rope_freq_base;   // RoPE base frequency, 0 = from model
  float rope_freq_scale;  // RoPE frequency scaling factor, 0 = from model
  bool memory_f16;        // use f16 instead of f32 for memory kv
  int32_t n_gpu_layers;   // number of layers to store in VRAM (-1 - use default)
  int32_t main_gpu;       // the GPU that is used for scratch and small tensors
  bool use_mlock;         // force system to keep model in RAM
  bool use_mmap;          // use mmap if possible
  int numa;               // attempt optimizations that help on some NUMA systems
  bool embedding;         // get only sentence embedding
  ext_server_lora_adapter_t *lora_adapters;
  char *mmproj;
  bool verbose_logging;   // Enable verbose logging of the server
} ext_server_params_t;

typedef struct ext_server_task_result {
  int id;
  bool stop;
  bool error;
  char *json_resp;  // null terminated, memory managed by ext_server
} ext_server_task_result_t;

// Initialize the server once per process
// err->id = 0 for success and err->msg[0] = NULL
// err->id != 0 for failure, and err->msg contains error message
void llama_server_init(ext_server_params_t *sparams, ext_server_resp_t *err);

// Run the main loop, called once per init
void llama_server_start();
// Stop the main loop and free up resources allocated in init and start. Init
// must be called again to reuse
void llama_server_stop();

// json_req null terminated string, memory managed by caller
// resp->id >= 0 on success (task ID)
// resp->id < 0 on error, and resp->msg contains error message
void llama_server_completion(const char *json_req, ext_server_resp_t *resp);

// Caller must call llama_server_release_task_result to free resp->json_resp
void llama_server_completion_next_result(const int task_id,
                                         ext_server_task_result_t *result);
void llama_server_completion_cancel(const int task_id, ext_server_resp_t *err);
void llama_server_release_task_result(ext_server_task_result_t *result);

// Caller must call llama_server_release_json_resp to free json_resp if err.id <
// 0
void llama_server_tokenize(const char *json_req, char **json_resp,
                           ext_server_resp_t *err);
void llama_server_detokenize(const char *json_req, char **json_resp,
                             ext_server_resp_t *err);
void llama_server_embedding(const char *json_req, char **json_resp,
                            ext_server_resp_t *err);
void llama_server_release_json_resp(char **json_resp);

#ifdef __cplusplus
}
#endif

#endif
#endif  // LLAMA_SERVER_LIBRARY
```
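Reading the comments in this header together, the intended task lifecycle is: submit a completion (which returns a task ID), poll `llama_server_completion_next_result` until `stop`, release each result, then cancel or stop. Continuing the earlier Go sketch (same hypothetical cgo file, so imports and the `dyn_*` wrappers are assumed to be in scope, since Go cannot call the C function pointers directly), a rough illustration of that loop might look like this; the helper name and buffer size are assumptions, not part of this commit.

```go
// streamCompletion is a hypothetical illustration of the lifecycle described
// by the header comments: completion -> next_result loop -> release -> done.
func streamCompletion(srv C.struct_dynamic_llama_server, jsonReq string, onResult func(string)) error {
	req := C.CString(jsonReq)
	defer C.free(unsafe.Pointer(req))

	var resp C.ext_server_resp_t
	resp.msg_len = 512
	resp.msg = (*C.char)(C.malloc(512))
	defer C.free(unsafe.Pointer(resp.msg))

	// resp.id >= 0 on success (task ID), < 0 on error per ext_server.h.
	C.dyn_llama_server_completion(srv, req, &resp)
	if resp.id < 0 {
		return fmt.Errorf("completion failed: %s", C.GoString(resp.msg))
	}
	taskID := resp.id

	for {
		var result C.ext_server_task_result_t
		C.dyn_llama_server_completion_next_result(srv, taskID, &result)
		payload := C.GoString(result.json_resp)
		// json_resp is owned by the ext_server; release it after copying.
		C.dyn_llama_server_release_task_result(srv, &result)

		if result.error {
			return fmt.Errorf("task %d failed: %s", taskID, payload)
		}
		onResult(payload)
		if result.stop {
			return nil
		}
	}
}
```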
Deleted file — generate/gen_common.sh (@@ -1,125 +0,0 @@):

```sh
# common logic across linux and darwin

init_vars() {
    case "${GOARCH}" in
    "amd64")
        ARCH="x86_64"
        ;;
    "arm64")
        ARCH="arm64"
        ;;
    *)
        ARCH=$(uname -m | sed -e "s/aarch64/arm64/g")
    esac

    LLAMACPP_DIR=../llama.cpp
    CMAKE_DEFS=""
    CMAKE_TARGETS="--target ext_server"
    if echo "${CGO_CFLAGS}" | grep -- '-g' >/dev/null; then
        CMAKE_DEFS="-DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_GPROF=on -DLLAMA_SERVER_VERBOSE=on ${CMAKE_DEFS}"
    else
        # TODO - add additional optimization flags...
        CMAKE_DEFS="-DCMAKE_BUILD_TYPE=Release -DLLAMA_SERVER_VERBOSE=off ${CMAKE_DEFS}"
    fi
    case $(uname -s) in
    "Darwin")
        LIB_EXT="dylib"
        WHOLE_ARCHIVE="-Wl,-force_load"
        NO_WHOLE_ARCHIVE=""
        GCC_ARCH="-arch ${ARCH}"
        ;;
    "Linux")
        LIB_EXT="so"
        WHOLE_ARCHIVE="-Wl,--whole-archive"
        NO_WHOLE_ARCHIVE="-Wl,--no-whole-archive"

        # Cross compiling not supported on linux - Use docker
        GCC_ARCH=""
        ;;
    *)
        ;;
    esac
    if [ -z "${CMAKE_CUDA_ARCHITECTURES}" ] ; then
        CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80"
    fi
}

git_module_setup() {
    if [ -n "${OLLAMA_SKIP_PATCHING}" ]; then
        echo "Skipping submodule initialization"
        return
    fi
    # Make sure the tree is clean after the directory moves
    if [ -d "${LLAMACPP_DIR}/gguf" ]; then
        echo "Cleaning up old submodule"
        rm -rf ${LLAMACPP_DIR}
    fi
    git submodule init
    git submodule update --force ${LLAMACPP_DIR}

}

apply_patches() {
    # Wire up our CMakefile
    if ! grep ollama ${LLAMACPP_DIR}/examples/server/CMakeLists.txt; then
        echo 'include (../../../ext_server/CMakeLists.txt) # ollama' >>${LLAMACPP_DIR}/examples/server/CMakeLists.txt
    fi

    if [ -n "$(ls -A ../patches/*.diff)" ]; then
        # apply temporary patches until fix is upstream
        for patch in ../patches/*.diff; do
            for file in $(grep "^+++ " ${patch} | cut -f2 -d' ' | cut -f2- -d/); do
                (cd ${LLAMACPP_DIR}; git checkout ${file})
            done
        done
        for patch in ../patches/*.diff; do
            (cd ${LLAMACPP_DIR} && git apply ${patch})
        done
    fi

    # Avoid duplicate main symbols when we link into the cgo binary
    sed -e 's/int main(/int __main(/g' <${LLAMACPP_DIR}/examples/server/server.cpp >${LLAMACPP_DIR}/examples/server/server.cpp.tmp &&
        mv ${LLAMACPP_DIR}/examples/server/server.cpp.tmp ${LLAMACPP_DIR}/examples/server/server.cpp
}

build() {
    cmake -S ${LLAMACPP_DIR} -B ${BUILD_DIR} ${CMAKE_DEFS}
    cmake --build ${BUILD_DIR} ${CMAKE_TARGETS} -j8
    mkdir -p ${BUILD_DIR}/lib/
    g++ -fPIC -g -shared -o ${BUILD_DIR}/lib/libext_server.${LIB_EXT} \
        ${GCC_ARCH} \
        ${WHOLE_ARCHIVE} ${BUILD_DIR}/examples/server/libext_server.a ${NO_WHOLE_ARCHIVE} \
        ${BUILD_DIR}/common/libcommon.a \
        ${BUILD_DIR}/libllama.a \
        -Wl,-rpath,\$ORIGIN \
        -lpthread -ldl -lm \
        ${EXTRA_LIBS}
}

compress_libs() {
    echo "Compressing payloads to reduce overall binary size..."
    pids=""
    rm -rf ${BUILD_DIR}/lib/*.${LIB_EXT}*.gz
    for lib in ${BUILD_DIR}/lib/*.${LIB_EXT}* ; do
        gzip -n --best -f ${lib} &
        pids+=" $!"
    done
    echo
    for pid in ${pids}; do
        wait $pid
    done
    echo "Finished compression"
}

# Keep the local tree clean after we're done with the build
cleanup() {
    (cd ${LLAMACPP_DIR}/examples/server/ && git checkout CMakeLists.txt server.cpp)

    if [ -n "$(ls -A ../patches/*.diff)" ]; then
        for patch in ../patches/*.diff; do
            for file in $(grep "^+++ " ${patch} | cut -f2 -d' ' | cut -f2- -d/); do
                (cd ${LLAMACPP_DIR}; git checkout ${file})
            done
        done
    fi
}
```
Deleted file — generate/gen_darwin.sh (@@ -1,77 +0,0 @@):

```sh
#!/bin/bash
# This script is intended to run inside the go generate
# working directory must be ./llm/generate/

# TODO - add hardening to detect missing tools (cmake, etc.)

set -ex
set -o pipefail
echo "Starting darwin generate script"
source $(dirname $0)/gen_common.sh
init_vars
git_module_setup
apply_patches

sign() {
    if [ -n "$APPLE_IDENTITY" ]; then
        codesign -f --timestamp --deep --options=runtime --sign "$APPLE_IDENTITY" --identifier ai.ollama.ollama $1
    fi
}

COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DCMAKE_SYSTEM_NAME=Darwin"

case "${GOARCH}" in
"amd64")
    COMMON_CPU_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=off -DLLAMA_NATIVE=off"

    #
    # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
    #
    CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
    BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu"
    echo "Building LCD CPU"
    build
    sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu/lib/libext_server.dylib
    compress_libs

    #
    # ~2011 CPU Dynamic library with more capabilities turned on to optimize performance
    # Approximately 400% faster than LCD on same CPU
    #
    init_vars
    CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
    BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx"
    echo "Building AVX CPU"
    build
    sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx/lib/libext_server.dylib
    compress_libs

    #
    # ~2013 CPU Dynamic library
    # Approximately 10% faster than AVX on same CPU
    #
    init_vars
    CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=on -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
    BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx2"
    echo "Building AVX2 CPU"
    EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation"
    build
    sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx2/lib/libext_server.dylib
    compress_libs
    ;;
"arm64")
    CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DLLAMA_ACCELERATE=on -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=on ${CMAKE_DEFS}"
    BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/metal"
    EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders"
    build
    sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/metal/lib/libext_server.dylib
    compress_libs
    ;;
*)
    echo "GOARCH must be set"
    echo "this script is meant to be run from within go generate"
    exit 1
    ;;
esac

cleanup
```
Deleted file — generate/gen_linux.sh (@@ -1,196 +0,0 @@):

```sh
#!/bin/bash
# This script is intended to run inside the go generate
# working directory must be llm/generate/

# First we build one or more CPU based LLM libraries
#
# Then if we detect CUDA, we build a CUDA dynamic library, and carry the required
# library dependencies
#
# Then if we detect ROCm, we build a dynamically loaded ROCm lib. The ROCM
# libraries are quite large, and also dynamically load data files at runtime
# which in turn are large, so we don't attempt to carry them as payload

set -ex
set -o pipefail

# See https://llvm.org/docs/AMDGPUUsage.html#processors for reference
amdGPUs() {
    if [ -n "${AMDGPU_TARGETS}" ]; then
        echo "${AMDGPU_TARGETS}"
        return
    fi
    GPU_LIST=(
        "gfx900"
        "gfx906:xnack-"
        "gfx908:xnack-"
        "gfx90a:xnack+"
        "gfx90a:xnack-"
        "gfx1010"
        "gfx1012"
        "gfx1030"
        "gfx1100"
        "gfx1101"
        "gfx1102"
    )
    (
        IFS=$';'
        echo "'${GPU_LIST[*]}'"
    )
}

echo "Starting linux generate script"
if [ -z "${CUDACXX}" ]; then
    if [ -x /usr/local/cuda/bin/nvcc ]; then
        export CUDACXX=/usr/local/cuda/bin/nvcc
    else
        # Try the default location in case it exists
        export CUDACXX=$(command -v nvcc)
    fi
fi
COMMON_CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off"
source $(dirname $0)/gen_common.sh
init_vars
git_module_setup
apply_patches

if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
    # Users building from source can tune the exact flags we pass to cmake for configuring
    # llama.cpp, and we'll build only 1 CPU variant in that case as the default.
    if [ -n "${OLLAMA_CUSTOM_CPU_DEFS}" ]; then
        echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\""
        CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}"
        BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu"
        echo "Building custom CPU"
        build
        compress_libs
    else
        # Darwin Rosetta x86 emulation does NOT support AVX, AVX2, AVX512
        # -DLLAMA_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
        # -DLLAMA_F16C -- 2012 Intel Ivy Bridge & AMD 2011 Bulldozer (No significant improvement over just AVX)
        # -DLLAMA_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen
        # -DLLAMA_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver
        # Note: the following seem to yield slower results than AVX2 - ymmv
        # -DLLAMA_AVX512 -- 2017 Intel Skylake and High End DeskTop (HEDT)
        # -DLLAMA_AVX512_VBMI -- 2018 Intel Cannon Lake
        # -DLLAMA_AVX512_VNNI -- 2021 Intel Alder Lake

        COMMON_CPU_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off"
        if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu" ]; then
            #
            # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
            #
            CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
            BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu"
            echo "Building LCD CPU"
            build
            compress_libs
        fi

        if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu_avx" ]; then
            #
            # ~2011 CPU Dynamic library with more capabilities turned on to optimize performance
            # Approximately 400% faster than LCD on same CPU
            #
            init_vars
            CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
            BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx"
            echo "Building AVX CPU"
            build
            compress_libs
        fi

        if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu_avx2" ]; then
            #
            # ~2013 CPU Dynamic library
            # Approximately 10% faster than AVX on same CPU
            #
            init_vars
            CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
            BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx2"
            echo "Building AVX2 CPU"
            build
            compress_libs
        fi
    fi
else
    echo "Skipping CPU generation step as requested"
fi

# If needed, look for the default CUDA toolkit location
if [ -z "${CUDA_LIB_DIR}" ] && [ -d /usr/local/cuda/lib64 ]; then
    CUDA_LIB_DIR=/usr/local/cuda/lib64
fi

# If needed, look for CUDA on Arch Linux
if [ -z "${CUDA_LIB_DIR}" ] && [ -d /opt/cuda/targets/x86_64-linux/lib ]; then
    CUDA_LIB_DIR=/opt/cuda/targets/x86_64-linux/lib
fi

# Allow override in case libcudart is in the wrong place
if [ -z "${CUDART_LIB_DIR}" ]; then
    CUDART_LIB_DIR="${CUDA_LIB_DIR}"
fi

if [ -d "${CUDA_LIB_DIR}" ]; then
    echo "CUDA libraries detected - building dynamic CUDA library"
    init_vars
    CUDA_MAJOR=$(ls "${CUDA_LIB_DIR}"/libcudart.so.* | head -1 | cut -f3 -d. || true)
    if [ -n "${CUDA_MAJOR}" ]; then
        CUDA_VARIANT=_v${CUDA_MAJOR}
    fi
    CMAKE_DEFS="-DLLAMA_CUBLAS=on -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}"
    BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cuda${CUDA_VARIANT}"
    EXTRA_LIBS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda"
    build

    # Carry the CUDA libs as payloads to help reduce dependency burden on users
    #
    # TODO - in the future we may shift to packaging these separately and conditionally
    # downloading them in the install script.
    DEPS="$(ldd ${BUILD_DIR}/lib/libext_server.so )"
    for lib in libcudart.so libcublas.so libcublasLt.so ; do
        DEP=$(echo "${DEPS}" | grep ${lib} | cut -f1 -d' ' | xargs || true)
        if [ -n "${DEP}" -a -e "${CUDA_LIB_DIR}/${DEP}" ]; then
            cp "${CUDA_LIB_DIR}/${DEP}" "${BUILD_DIR}/lib/"
        elif [ -e "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" ]; then
            cp "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" "${BUILD_DIR}/lib/"
        elif [ -e "${CUDART_LIB_DIR}/${lib}" ]; then
            cp -d ${CUDART_LIB_DIR}/${lib}* "${BUILD_DIR}/lib/"
        else
            cp -d "${CUDA_LIB_DIR}/${lib}*" "${BUILD_DIR}/lib/"
        fi
    done
    compress_libs

fi

if [ -z "${ROCM_PATH}" ]; then
    # Try the default location in case it exists
    ROCM_PATH=/opt/rocm
fi

if [ -z "${CLBlast_DIR}" ]; then
    # Try the default location in case it exists
    if [ -d /usr/lib/cmake/CLBlast ]; then
        export CLBlast_DIR=/usr/lib/cmake/CLBlast
    fi
fi

if [ -d "${ROCM_PATH}" ]; then
    echo "ROCm libraries detected - building dynamic ROCm library"
    if [ -f ${ROCM_PATH}/lib/librocm_smi64.so.? ]; then
        ROCM_VARIANT=_v$(ls ${ROCM_PATH}/lib/librocm_smi64.so.? | cut -f3 -d. || true)
    fi
    init_vars
    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DLLAMA_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
    BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/rocm${ROCM_VARIANT}"
    EXTRA_LIBS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -Wl,-rpath,${ROCM_PATH}/lib,-rpath,/opt/amdgpu/lib/x86_64-linux-gnu/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu"
    build

    # Note: the ROCM libs and runtime library files are too large to embed, so we depend on
    # them being present at runtime on the host
    compress_libs
fi

cleanup
```
Deleted file — generate/gen_windows.ps1 (@@ -1,209 +0,0 @@):

```powershell
#!powershell

$ErrorActionPreference = "Stop"

function init_vars {
    $script:SRC_DIR = $(resolve-path "..\..\")
    $script:llamacppDir = "../llama.cpp"
    $script:cmakeDefs = @("-DBUILD_SHARED_LIBS=on", "-DLLAMA_NATIVE=off", "-A", "x64")
    $script:cmakeTargets = @("ext_server")
    $script:ARCH = "amd64" # arm not yet supported.
    if ($env:CGO_CFLAGS -contains "-g") {
        $script:cmakeDefs += @("-DCMAKE_VERBOSE_MAKEFILE=on", "-DLLAMA_SERVER_VERBOSE=on")
        $script:config = "RelWithDebInfo"
    } else {
        $script:cmakeDefs += @("-DLLAMA_SERVER_VERBOSE=off")
        $script:config = "Release"
    }
    # Try to find the CUDA dir
    if ($env:CUDA_LIB_DIR -eq $null) {
        $d=(get-command -ea 'silentlycontinue' nvcc).path
        if ($d -ne $null) {
            $script:CUDA_LIB_DIR=($d| split-path -parent)
            $script:CUDA_INCLUDE_DIR=($script:CUDA_LIB_DIR|split-path -parent)+"\include"
        }
    } else {
        $script:CUDA_LIB_DIR=$env:CUDA_LIB_DIR
    }
    $script:GZIP=(get-command -ea 'silentlycontinue' gzip).path
    $script:DUMPBIN=(get-command -ea 'silentlycontinue' dumpbin).path
    if ($null -eq $env:CMAKE_CUDA_ARCHITECTURES) {
        $script:CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80"
    } else {
        $script:CMAKE_CUDA_ARCHITECTURES=$env:CMAKE_CUDA_ARCHITECTURES
    }
    # Note: 10 Windows Kit signtool crashes with GCP's plugin
    ${script:SignTool}="C:\Program Files (x86)\Windows Kits\8.1\bin\x64\signtool.exe"
    if ("${env:KEY_CONTAINER}") {
        ${script:OLLAMA_CERT}=$(resolve-path "${script:SRC_DIR}\ollama_inc.crt")
    }
}

function git_module_setup {
    # TODO add flags to skip the init/patch logic to make it easier to mod llama.cpp code in-repo
    & git submodule init
    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
    & git submodule update --force "${script:llamacppDir}"
    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
}

function apply_patches {
    # Wire up our CMakefile
    if (!(Select-String -Path "${script:llamacppDir}/examples/server/CMakeLists.txt" -Pattern 'ollama')) {
        Add-Content -Path "${script:llamacppDir}/examples/server/CMakeLists.txt" -Value 'include (../../../ext_server/CMakeLists.txt) # ollama'
    }

    # Apply temporary patches until fix is upstream
    $patches = Get-ChildItem "../patches/*.diff"
    foreach ($patch in $patches) {
        # Extract file paths from the patch file
        $filePaths = Get-Content $patch.FullName | Where-Object { $_ -match '^\+\+\+ ' } | ForEach-Object {
            $parts = $_ -split ' '
            ($parts[1] -split '/', 2)[1]
        }

        # Checkout each file
        Set-Location -Path ${script:llamacppDir}
        foreach ($file in $filePaths) {
            git checkout $file
        }
    }

    # Apply each patch
    foreach ($patch in $patches) {
        Set-Location -Path ${script:llamacppDir}
        git apply $patch.FullName
    }

    # Avoid duplicate main symbols when we link into the cgo binary
    $content = Get-Content -Path "${script:llamacppDir}/examples/server/server.cpp"
    $content = $content -replace 'int main\(', 'int __main('
    Set-Content -Path "${script:llamacppDir}/examples/server/server.cpp" -Value $content
}

function build {
    write-host "generating config with: cmake -S ${script:llamacppDir} -B $script:buildDir $script:cmakeDefs"
    & cmake --version
    & cmake -S "${script:llamacppDir}" -B $script:buildDir $script:cmakeDefs
    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
    write-host "building with: cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ })"
    & cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ })
    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
}

function install {
    rm -ea 0 -recurse -force -path "${script:buildDir}/lib"
    md "${script:buildDir}/lib" -ea 0 > $null
    cp "${script:buildDir}/bin/${script:config}/ext_server.dll" "${script:buildDir}/lib"
    cp "${script:buildDir}/bin/${script:config}/llama.dll" "${script:buildDir}/lib"
    # Display the dll dependencies in the build log
    if ($script:DUMPBIN -ne $null) {
        & "$script:DUMPBIN" /dependents "${script:buildDir}/bin/${script:config}/ext_server.dll" | select-string ".dll"
    }
}

function sign {
    if ("${env:KEY_CONTAINER}") {
        write-host "Signing ${script:buildDir}/lib/*.dll"
        foreach ($file in (get-childitem "${script:buildDir}/lib/*.dll")){
            & "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" `
                /csp "Google Cloud KMS Provider" /kc "${env:KEY_CONTAINER}" $file
            if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
        }
    }
}

function compress_libs {
    if ($script:GZIP -eq $null) {
        write-host "gzip not installed, not compressing files"
        return
    }
    write-host "Compressing dlls..."
    $libs = dir "${script:buildDir}/lib/*.dll"
    foreach ($file in $libs) {
        & "$script:GZIP" --best -f $file
    }
}

function cleanup {
    $patches = Get-ChildItem "../patches/*.diff"
    foreach ($patch in $patches) {
        # Extract file paths from the patch file
        $filePaths = Get-Content $patch.FullName | Where-Object { $_ -match '^\+\+\+ ' } | ForEach-Object {
            $parts = $_ -split ' '
            ($parts[1] -split '/', 2)[1]
        }

        # Checkout each file
        Set-Location -Path ${script:llamacppDir}
        foreach ($file in $filePaths) {
            git checkout $file
        }
    }
    Set-Location "${script:llamacppDir}/examples/server"
    git checkout CMakeLists.txt server.cpp

}

init_vars
git_module_setup
apply_patches

# -DLLAMA_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
# -DLLAMA_F16C -- 2012 Intel Ivy Bridge & AMD 2011 Bulldozer (No significant improvement over just AVX)
# -DLLAMA_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen
# -DLLAMA_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver

$script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on")

init_vars
$script:cmakeDefs = $script:commonCpuDefs + @("-DLLAMA_AVX=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu"
write-host "Building LCD CPU"
build
install
sign
compress_libs

init_vars
$script:cmakeDefs = $script:commonCpuDefs + @("-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu_avx"
write-host "Building AVX CPU"
build
install
sign
compress_libs

init_vars
$script:cmakeDefs = $script:commonCpuDefs + @("-DLLAMA_AVX=on", "-DLLAMA_AVX2=on", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=on", "-DLLAMA_F16C=on") + $script:cmakeDefs
$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu_avx2"
write-host "Building AVX2 CPU"
build
install
sign
compress_libs

if ($null -ne $script:CUDA_LIB_DIR) {
    # Then build cuda as a dynamically loaded library
    $nvcc = "$script:CUDA_LIB_DIR\nvcc.exe"
    $script:CUDA_VERSION=(get-item ($nvcc | split-path | split-path)).Basename
    if ($null -ne $script:CUDA_VERSION) {
        $script:CUDA_VARIANT="_"+$script:CUDA_VERSION
    }
    init_vars
    $script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT"
    $script:cmakeDefs += @("-DLLAMA_CUBLAS=ON", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR", "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}")
    build
    install
    sign
    compress_libs
}
# TODO - actually implement ROCm support on windows
$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/rocm"

rm -ea 0 -recurse -force -path "${script:buildDir}/lib"
md "${script:buildDir}/lib" -ea 0 > $null
echo $null >> "${script:buildDir}/lib/.generated"

cleanup
write-host "`ngo generate completed"
```
Deleted file — go generate stub for darwin, package generate (@@ -1,3 +0,0 @@):

```go
package generate

//go:generate sh ./gen_darwin.sh
```
Deleted file — go generate stub for linux, package generate (@@ -1,3 +0,0 @@):

```go
package generate

//go:generate bash ./gen_linux.sh
```
Deleted file — go generate stub for windows, package generate (@@ -1,3 +0,0 @@):

```go
package generate

//go:generate powershell -ExecutionPolicy Bypass -File ./gen_windows.ps1
```
Deleted — llama.cpp submodule reference (@@ -1 +0,0 @@):

```
Subproject commit c29af7e2252d288f2ea58a7d437c1cb7c0abf160
```
llm/llm.go (10 changes):

```diff
@@ -6,6 +6,7 @@ import (
 	"log/slog"
 	"os"
 	"runtime"
+	"time"

 	"github.com/jmorganca/ollama/api"
 	"github.com/jmorganca/ollama/gpu"
@@ -165,3 +166,12 @@ func newLlmServer(gpuInfo gpu.GpuInfo, workDir, model string, adapters, projecto

 	return nil, err2
 }
+
+func parseDurationMs(ms float64) time.Duration {
+	dur, err := time.ParseDuration(fmt.Sprintf("%fms", ms))
+	if err != nil {
+		panic(err)
+	}
+
+	return dur
+}
```
llm/llm_darwin_amd64.go (new file, 14 lines):

```go
//go:generate cmake -S server -B server/build/cpu -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DCMAKE_SYSTEM_NAME=Darwin -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DLLAMA_METAL=off -DLLAMA_NATIVE=off
//go:generate cmake -S server -B server/build/cpu_avx -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DCMAKE_SYSTEM_NAME=Darwin -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DLLAMA_METAL=off -DLLAMA_NATIVE=off -DLLAMA_AVX=on
//go:generate cmake -S server -B server/build/cpu_avx2 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DCMAKE_SYSTEM_NAME=Darwin -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DLLAMA_METAL=off -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=on
//go:generate cmake --build server/build/cpu --target server -- -j4
//go:generate cmake --build server/build/cpu_avx --target server -- -j4
//go:generate cmake --build server/build/cpu_avx2 --target server -- -j4
package llm

import "embed"

//go:embed server/build/cpu/server
//go:embed server/build/cpu_avx/server
//go:embed server/build/cpu_avx2/server
var libEmbed embed.FS
```
llm/llm_darwin_arm64.go (new file, 8 lines):

```go
//go:generate cmake -S server -B server/build/metal -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DCMAKE_SYSTEM_NAME=Darwin -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64
//go:generate cmake --build server/build/metal --target server -- -j4
package llm

import "embed"

//go:embed server/build/metal/ggml-metal.metal server/build/metal/server
var libEmbed embed.FS
```
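The new darwin files above build a `server` binary per variant with `go:generate` and then embed the results with `go:embed`. As a hedged, standalone sketch (the function and directory names are illustrative assumptions, not code from this commit), an embedded payload like this would typically need to be written back out to disk before it can be executed:

```go
package llm

import (
	"embed"
	"fmt"
	"os"
	"path/filepath"
)

// extractPayload copies one embedded file (e.g. "server/build/metal/server")
// out of an embed.FS into destDir so it can be executed at runtime.
func extractPayload(payloads embed.FS, name, destDir string) (string, error) {
	data, err := payloads.ReadFile(name)
	if err != nil {
		return "", fmt.Errorf("reading embedded %s: %w", name, err)
	}
	if err := os.MkdirAll(destDir, 0o755); err != nil {
		return "", err
	}
	dest := filepath.Join(destDir, filepath.Base(name))
	// 0o755 so an embedded server binary stays executable once on disk.
	if err := os.WriteFile(dest, data, 0o755); err != nil {
		return "", err
	}
	return dest, nil
}
```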
@@ -1,114 +0,0 @@ (deleted file)
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 2b2f4a0f..25857bdd 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -31,6 +31,10 @@
 #include <atomic>
 #include <signal.h>

+#ifdef GGML_USE_CUBLAS
+extern "C" GGML_CALL void ggml_free_cublas(void);
+#endif
+
 using json = nlohmann::json;

 struct server_params {
@@ -363,6 +367,9 @@ struct llama_server_context
         llama_free_model(model);
         model = nullptr;
     }
+#ifdef GGML_USE_CUBLAS
+        ggml_free_cublas();
+#endif
 }

 bool load_model(const gpt_params &params_)
@@ -3494,6 +3501,7 @@ int main(int argc, char **argv)
     sigemptyset (&sigint_action.sa_mask);
     sigint_action.sa_flags = 0;
     sigaction(SIGINT, &sigint_action, NULL);
+    sigaction(SIGUSR1, &sigint_action, NULL);
 #elif defined (_WIN32)
     auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
         return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 0c6501e9..75c12723 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -43,6 +43,7 @@
 #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
 #define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
 #define cublasCreate hipblasCreate
+#define cublasDestroy hipblasDestroy
 #define cublasGemmEx hipblasGemmEx
 #define cublasGemmBatchedEx hipblasGemmBatchedEx
 #define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
@@ -8694,10 +8695,10 @@ GGML_CALL bool ggml_cublas_loaded(void) {
     return g_cublas_loaded;
 }

-GGML_CALL void ggml_init_cublas() {
-    static bool initialized = false;
+static bool g_cublas_initialized = false;

-    if (!initialized) {
+GGML_CALL void ggml_init_cublas() {
+    if (!g_cublas_initialized) {

 #ifdef __HIP_PLATFORM_AMD__
         // Workaround for a rocBLAS bug when using multiple graphics cards:
@@ -8707,7 +8708,7 @@ GGML_CALL void ggml_init_cublas() {
 #endif

         if (cudaGetDeviceCount(&g_device_count) != cudaSuccess) {
-            initialized = true;
+            g_cublas_initialized = true;
             g_cublas_loaded = false;
             fprintf(stderr, "%s: no " GGML_CUDA_NAME " devices found, " GGML_CUDA_NAME " will be disabled\n", __func__);
             return;
@@ -8778,7 +8779,7 @@ GGML_CALL void ggml_init_cublas() {
         // configure logging to stdout
         // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));

-        initialized = true;
+        g_cublas_initialized = true;
         g_cublas_loaded = true;
     }
 }
@@ -12345,3 +12346,22 @@ GGML_CALL int ggml_backend_cuda_reg_devices() {
     }
     return device_count;
 }
+
+extern "C" GGML_CALL void ggml_free_cublas(void);
+GGML_CALL void ggml_free_cublas(void) {
+    for (int id = 0; id < g_device_count; ++id) {
+#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
+        if (g_device_caps[id].vmm) {
+            CU_CHECK(cuMemUnmap(g_cuda_pool_addr[id], g_cuda_pool_size[id]));
+            g_cuda_pool_size[id] = 0;
+            g_cuda_pool_addr[id] = 0;
+        }
+#endif
+        // TODO: free legacy non-vmm memory
+        // destroy cublas handle
+        CUBLAS_CHECK(cublasDestroy(g_cublas_handles[id]));
+        g_cublas_handles[id] = nullptr;
+    }
+
+    g_cublas_initialized = false;
+}
diff --git a/ggml-cuda.h b/ggml-cuda.h
index b1ebd61d..6dd58ddf 100644
--- a/ggml-cuda.h
+++ b/ggml-cuda.h
@@ -23,6 +23,9 @@ GGML_API GGML_CALL void ggml_init_cublas(void);
 // Returns `true` if there are available CUDA devices and cublas loads successfully; otherwise, it returns `false`.
 GGML_API GGML_CALL bool ggml_cublas_loaded(void);

+// Release CUDA resources
+GGML_API GGML_CALL void ggml_free_cublas(void);
+
 GGML_API GGML_CALL void * ggml_cuda_host_malloc(size_t size);
 GGML_API GGML_CALL void ggml_cuda_host_free(void * ptr);
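Note (not part of the diff): the server.cpp hunk above wires SIGUSR1 into the same handler as SIGINT, so a supervising process can request a clean shutdown (which, with the ggml-cuda changes, also releases the cuBLAS handles and CUDA pool memory). A sketch of how a Unix-only Go supervisor might use that; stopServer and the 10-second grace period are assumptions, not code from this branch:

package llm

import (
	"os/exec"
	"syscall"
	"time"
)

// stopServer (hypothetical) asks the server subprocess to exit via SIGUSR1
// and falls back to killing it if it does not finish within the grace period.
func stopServer(cmd *exec.Cmd) error {
	if err := cmd.Process.Signal(syscall.SIGUSR1); err != nil {
		return err
	}
	done := make(chan error, 1)
	go func() { done <- cmd.Wait() }()
	select {
	case err := <-done:
		return err
	case <-time.After(10 * time.Second):
		return cmd.Process.Kill()
	}
}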
@@ -1,8 +0,0 @@ (deleted file)
package llm

import (
	"embed"
)

//go:embed llama.cpp/ggml-metal.metal llama.cpp/build/darwin/x86_64/*/lib/*.dylib*
var libEmbed embed.FS
@@ -1,8 +0,0 @@ (deleted file)
package llm

import (
	"embed"
)

//go:embed llama.cpp/ggml-metal.metal llama.cpp/build/darwin/arm64/*/lib/*.dylib*
var libEmbed embed.FS
@@ -1,8 +0,0 @@ (deleted file)
package llm

import (
	"embed"
)

//go:embed llama.cpp/build/linux/*/*/lib/*.so*
var libEmbed embed.FS
@@ -1,58 +0,0 @@ (deleted file)
package llm

import (
	"testing"

	"github.com/jmorganca/ollama/gpu"
	"github.com/stretchr/testify/assert"
)

func TestGetDynLibs(t *testing.T) {
	availableDynLibs = map[string]string{
		"cpu": "X_cpu",
	}
	assert.Equal(t, false, rocmDynLibPresent())
	res := getDynLibs(gpu.GpuInfo{Library: "cpu"})
	assert.Len(t, res, 1)
	assert.Equal(t, availableDynLibs["cpu"], res[0])

	variant := gpu.GetCPUVariant()
	if variant != "" {
		variant = "_" + variant
	}
	availableDynLibs = map[string]string{
		"rocm_v5":       "X_rocm_v5",
		"rocm_v6":       "X_rocm_v6",
		"cpu" + variant: "X_cpu",
	}
	assert.Equal(t, true, rocmDynLibPresent())
	res = getDynLibs(gpu.GpuInfo{Library: "rocm"})
	assert.Len(t, res, 3)
	assert.Equal(t, availableDynLibs["rocm_v5"], res[0])
	assert.Equal(t, availableDynLibs["rocm_v6"], res[1])
	assert.Equal(t, availableDynLibs["cpu"+variant], res[2])

	res = getDynLibs(gpu.GpuInfo{Library: "rocm", Variant: "v6"})
	assert.Len(t, res, 3)
	assert.Equal(t, availableDynLibs["rocm_v6"], res[0])
	assert.Equal(t, availableDynLibs["rocm_v5"], res[1])
	assert.Equal(t, availableDynLibs["cpu"+variant], res[2])

	res = getDynLibs(gpu.GpuInfo{Library: "cuda"})
	assert.Len(t, res, 1)
	assert.Equal(t, availableDynLibs["cpu"+variant], res[0])

	res = getDynLibs(gpu.GpuInfo{Library: "default"})
	assert.Len(t, res, 1)
	assert.Equal(t, "default", res[0])

	availableDynLibs = map[string]string{
		"rocm":          "X_rocm_v5",
		"cpu" + variant: "X_cpu",
	}
	assert.Equal(t, true, rocmDynLibPresent())
	res = getDynLibs(gpu.GpuInfo{Library: "rocm", Variant: "v6"})
	assert.Len(t, res, 2)
	assert.Equal(t, availableDynLibs["rocm"], res[0])
	assert.Equal(t, availableDynLibs["cpu"+variant], res[1])
}
@@ -1,8 +0,0 @@ (deleted file)
package llm

import (
	"embed"
)

//go:embed llama.cpp/build/windows/*/*/lib/*.dll*
var libEmbed embed.FS
llm/server/.gitignore (1, vendored, new file)
@@ -0,0 +1 @@
build
llm/server/CMakeLists.txt (93, new file)
@@ -0,0 +1,93 @@
cmake_minimum_required(VERSION 3.14)

project(llm)

include(FetchContent)

set(add_token_patch
  git apply ${CMAKE_CURRENT_SOURCE_DIR}/patches/add_token.patch
)

set(FETCHCONTENT_BASE_DIR "${CMAKE_SOURCE_DIR}/build/llama.cpp")

FetchContent_Declare(
  llama_cpp
  GIT_REPOSITORY https://github.com/ggerganov/llama.cpp.git
  GIT_TAG c29af7e2252d288f2ea58a7d437c1cb7c0abf160

  # this could be risky if the patch doesn't apply
  PATCH_COMMAND ${add_token_patch} || true
)

FetchContent_MakeAvailable(llama_cpp)
add_subdirectory(${llama_cpp_SOURCE_DIR}/examples/llava)

# code signing
function(sign target)
  if(APPLE)
    if(DEFINED ENV{APPLE_IDENTITY})
      add_custom_command(TARGET ${target} POST_BUILD
        COMMAND codesign
          -f
          --timestamp
          --deep
          --options=runtime
          --sign "$ENV{APPLE_IDENTITY}"
          --identifier ai.ollama.ollama
          $<TARGET_FILE:${target}>
        COMMENT "Signing macOS binary: ${target}"
      )
    endif()
  elseif(WIN32)
    find_program(SIGNTOOL_EXE NAMES signtool PATHS "C:\\Program Files (x86)\\Windows Kits\\8.1\\bin\\x64" NO_DEFAULT_PATH)
    set(KEY_CONTAINER "$ENV{KEY_CONTAINER}")
    set(OLLAMA_CERT "$ENV{OLLAMA_CERT}")

    if(SIGNTOOL_EXE AND KEY_CONTAINER AND OLLAMA_CERT)
      add_custom_command(TARGET ${target} POST_BUILD
        COMMAND "${SIGNTOOL_EXE}"
          "sign"
          "/v"
          "/fd" "sha256"
          "/t" "http://timestamp.digicert.com"
          "/f" "${OLLAMA_CERT}"
          "/csp" "Google Cloud KMS Provider"
          "/kc" "${KEY_CONTAINER}"
          "$<TARGET_FILE:${target}>"
        COMMENT "Signing Windows binary: ${target}"
      )
    endif()
  endif()
endfunction()

set(CMAKE_CUDA_ARCHITECTURES "50;52;61;70;75;80")

function(gzip target)
  set(gzip_target "gzip_${target}")
  add_custom_target(${gzip_target} ALL
    COMMAND gzip -k -f ${target}
    COMMENT "Gzipping ${target}"
    VERBATIM
  )
  add_dependencies(${gzip_target} ${target})
endfunction()

function(link_windows_libraries target)
  if (WIN32)
    target_link_libraries(${target} PRIVATE ws2_32)
  endif()
endfunction()

add_executable(server ${llama_cpp_SOURCE_DIR}/examples/server/server.cpp ${llama_cpp_SOURCE_DIR})
target_compile_definitions(server PRIVATE)
target_link_libraries(server PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(server PRIVATE cxx_std_17)
link_windows_libraries(server)
sign(server)
gzip(server)

if(CMAKE_SYSTEM_PROCESSOR MATCHES "arm64")
  configure_file(${llama_cpp_SOURCE_DIR}/ggml-metal.metal ${CMAKE_BINARY_DIR}/ggml-metal.metal COPYONLY)
endif()

# TODO: ROCm
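Note (not part of the diff): the go:generate directives in llm_darwin_amd64.go build this CMake project once per CPU variant (server/build/cpu, cpu_avx, cpu_avx2), so at runtime one of the embedded builds has to be chosen. A sketch of that selection using golang.org/x/sys/cpu for feature detection; the real code may rely on the repo's own gpu.GetCPUVariant instead:

package llm

import "golang.org/x/sys/cpu"

// serverBuildDir (hypothetical) maps the host's x86 features to one of the
// build directories produced by the go:generate cmake invocations above.
func serverBuildDir() string {
	switch {
	case cpu.X86.HasAVX2:
		return "server/build/cpu_avx2"
	case cpu.X86.HasAVX:
		return "server/build/cpu_avx"
	default:
		return "server/build/cpu"
	}
}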
llm/utils.go (15, deleted)
@@ -1,15 +0,0 @@
package llm

import (
	"fmt"
	"time"
)

func parseDurationMs(ms float64) time.Duration {
	dur, err := time.ParseDuration(fmt.Sprintf("%fms", ms))
	if err != nil {
		panic(err)
	}

	return dur
}