forked from third-party-mirrors/ollama
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4f6cd687..b8c6896b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -189,3 +189,4 @@ if (LLAMA_BUILD_EXAMPLES)
     add_subdirectory(examples)
     add_subdirectory(pocs)
 endif()
+add_subdirectory(../ext_server ext_server) # ollama
diff --git a/src/llama.cpp b/src/llama.cpp
index 2b9ace28..b0151571 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -18609,6 +18609,20 @@ static int llama_apply_lora_from_file_internal(
         return 1;
     }
 
+    // show tensor data
+    auto show_tensor = [](std::string name, ggml_tensor *t) {
+        LLAMA_LOG_INFO("%s\n", name.c_str());
+
+        for(int i=0; i<3; i++) {
+            for(int j=0; j<3; j++) {
+                float v = ggml_get_f32_nd(t, i, j, 0, 0);
+                LLAMA_LOG_INFO("%.8f ", v);
+            }
+            LLAMA_LOG_INFO(" ...\n");
+        }
+        LLAMA_LOG_INFO(" ...\n");
+    };
+
     // load tensor data
     auto load_tensor = [&read_buf, &fin](const tensor_meta & tensor_meta, ggml_tensor * tensor) {
         read_buf.resize(ggml_nbytes(tensor));
@@ -18619,6 +18633,9 @@ static int llama_apply_lora_from_file_internal(
         load_tensor(metaA, loraA);
         load_tensor(metaB, loraB);
 
+        show_tensor(base_name + ".loraA", loraA);
+        show_tensor(base_name + ".loraB", loraB);
+
         // load base model tensor data
         if (ml) {
            ml->load_data_for(base_t);
@@ -18633,8 +18650,10 @@ static int llama_apply_lora_from_file_internal(
         }
 
         if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
-            LLAMA_LOG_ERROR("%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
-                            " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]);
+            LLAMA_LOG_ERROR("%s: incompatible tensors: base [%lld, %lld] loraA [%lld, %lld] loraB [%lld, %lld]\n", __func__,
+                base_t->ne[0], base_t->ne[1],
+                loraA->ne[0], loraA->ne[1],
+                loraB->ne[0], loraB->ne[1]);
             ggml_free(lora_ctx);
             ggml_backend_buffer_free(lora_buf);
             ggml_backend_free(backend_cpu);
@@ -18643,14 +18662,18 @@ static int llama_apply_lora_from_file_internal(
 
         auto build_lora_graph = [&]() {
             // w = w + BA*s
-            ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
+            ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraB, loraA);
             ggml_set_name(BA, "BA");
 
             if (scaling != 1.0f) {
-                BA = ggml_scale(lora_ctx, BA, scaling);
+                //BA = ggml_scale(lora_ctx, BA, scaling);
+                BA = ggml_scale(lora_ctx, BA, 20.0);
                 ggml_set_name(BA, "BA_scaled");
             }
 
+            // transpose matrix before we add
+            BA = ggml_cont(lora_ctx, ggml_transpose(lora_ctx, BA));
+
             ggml_tensor * r;
             r = ggml_add_inplace(lora_ctx, base_t, BA);
             ggml_set_name(r, "r_add");