llm: add server entrypoint for mllama

jmorganca 2024-09-25 14:37:28 -07:00
parent 8ac915f709
commit d0c8ce5ea4
2 changed files with 28 additions and 74 deletions


@@ -1032,6 +1032,18 @@ struct llama_server_context
bool process_images(server_slot &slot) const
{
// Set cross attention state for mllama models
// TODO (jmorganca): this should be provided via the API
// TODO (jmorganca): generalize this beyond mllama models
char arch_str[256];
llama_model_meta_val_str(model, "general.architecture", arch_str, 256);
if (strcmp(arch_str, "mllama") == 0) {
// TODO (jmorganca): this should be passed in via the llama_decode api
// or similar, maybe using the llama_batch struct
// llama_reset_cross_attn_state(ctx);
// llama_set_cross_attn_state(ctx, (float*)cross_attn_state);
}
for (slot_image &img : slot.images)
{
if (!img.request_encode_image)
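The hunk above special-cases mllama by reading the "general.architecture" key from the model's GGUF metadata; the cross attention calls it gates are still commented out. A minimal standalone sketch of that check, assuming only a loaded llama_model (is_mllama is an illustrative helper, not part of the commit):

```cpp
#include <cstring>

#include "llama.h"

// Illustrative helper (not from this commit): report whether a loaded model
// declares the "mllama" architecture in its GGUF metadata.
static bool is_mllama(const struct llama_model * model) {
    char arch[256] = {0};
    // llama_model_meta_val_str() returns a negative value when the key is absent
    if (llama_model_meta_val_str(model, "general.architecture", arch, sizeof(arch)) < 0) {
        return false;
    }
    return strcmp(arch, "mllama") == 0;
}
```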


@@ -1,4 +1,4 @@
From c2db1ad0fc86de189959b628021a970511e9c6f9 Mon Sep 17 00:00:00 2001
From 9935fbbf26ad4d9ca7735ec6ba4c0a206c0c8329 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Tue, 24 Sep 2024 11:53:40 -0700
Subject: [PATCH] add mllama support
@@ -13,8 +13,8 @@ kv cache once per run
remaining is to implement the cross attention mask
---
include/llama.h | 5 +
src/llama.cpp | 514 ++++++++++++++++++++++++++++++++++++++++++++++--
2 files changed, 499 insertions(+), 20 deletions(-)
src/llama.cpp | 470 ++++++++++++++++++++++++++++++++++++++++++++++--
2 files changed, 461 insertions(+), 14 deletions(-)
diff --git a/include/llama.h b/include/llama.h
index bfc37e88..94ce82a4 100644
@@ -33,7 +33,7 @@ index bfc37e88..94ce82a4 100644
LLAMA_API void llama_free(struct llama_context * ctx);
diff --git a/src/llama.cpp b/src/llama.cpp
index b7771f53..75bbc226 100644
index b7771f53..72a57a38 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -170,6 +170,7 @@ static std::string format(const char * fmt, ...) {
@@ -193,25 +193,6 @@ index b7771f53..75bbc226 100644
};
// very similar to llama_batch,
@@ -2684,12 +2749,12 @@ struct llama_ubatch {
uint32_t n_seq_tokens; // tokens per sequence
uint32_t n_seqs;
- llama_token * token; // [n_tokens]
- float * embd; // [n_embd, n_tokens]
- llama_pos * pos; // [n_tokens]
- int32_t * n_seq_id; // [n_seqs]
- llama_seq_id ** seq_id; // [n_seqs]
- int8_t * output; // [n_tokens]
+ llama_token * token; // [n_tokens]
+ float * embd; // [n_embd, n_tokens]
+ llama_pos * pos; // [n_tokens]
+ int32_t * n_seq_id; // [n_seqs]
+ llama_seq_id ** seq_id; // [n_seqs]
+ int8_t * output; // [n_tokens]
};
struct llama_kv_cell {
@@ -3268,6 +3333,10 @@ struct llama_context {
// host buffer for the model output (logits and embeddings)
ggml_backend_buffer_t buf_output = nullptr;
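llama_ubatch above is the internal view of the public llama_batch, which is the struct the earlier TODO suggests should eventually carry the cross attention state into llama_decode. For context, a hedged sketch of that existing public path, using only documented llama_batch fields (decode_tokens is an illustrative helper, not from the patch):

```cpp
#include <vector>

#include "llama.h"

// Illustrative helper (not from the patch): push a token sequence through the
// public llama_batch / llama_decode path, requesting logits only for the last
// token. Assumes an initialized llama_context.
static int decode_tokens(llama_context * ctx, const std::vector<llama_token> & tokens) {
    llama_batch batch = llama_batch_init((int32_t) tokens.size(), /*embd*/ 0, /*n_seq_max*/ 1);
    for (size_t i = 0; i < tokens.size(); ++i) {
        batch.token   [batch.n_tokens]    = tokens[i];
        batch.pos     [batch.n_tokens]    = (llama_pos) i;
        batch.n_seq_id[batch.n_tokens]    = 1;
        batch.seq_id  [batch.n_tokens][0] = 0;                        // single sequence
        batch.logits  [batch.n_tokens]    = (i + 1 == tokens.size()); // logits for last token only
        batch.n_tokens++;
    }
    const int ret = llama_decode(ctx, batch); // 0 on success
    llama_batch_free(batch);
    return ret;
}
```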
@@ -404,48 +385,7 @@ index b7771f53..75bbc226 100644
// note: storing RoPE-ed version of K in the KV cache
ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view));
@@ -9625,6 +9788,40 @@ static struct ggml_tensor * llm_build_rwkv6_channel_mix(
return ggml_mul(ctx, r, llm_build_lora_mm(lctx, ctx, layer->channel_mix_value, k));
}
+
+static void show_tensor(std::string name, ggml_tensor *t) {
+ LLAMA_LOG_INFO("%s [%lld, %lld]\n", name.c_str(), t->ne[0], t->ne[1]);
+
+ int cols = int(t->ne[0]);
+ int rows = int(t->ne[1]);
+
+ for(int r=0; r<3; r++) {
+ for(int c=0; c<3; c++) {
+ float v = ggml_get_f32_nd(t, c, r, 0, 0);
+ LLAMA_LOG_INFO("%11.8f ", v);
+ }
+ LLAMA_LOG_INFO("... ");
+ for(int c=0; c<3; c++) {
+ float v = ggml_get_f32_nd(t, cols-3+c, r, 0, 0);
+ LLAMA_LOG_INFO("%11.8f ", v);
+ }
+ LLAMA_LOG_INFO("\n");
+ }
+ LLAMA_LOG_INFO(" ...\n");
+ for(int r=0; r<3; r++) {
+ for(int c=0; c<3; c++) {
+ float v = ggml_get_f32_nd(t, c, rows-3+r, 0, 0);
+ LLAMA_LOG_INFO("%11.8f ", v);
+ }
+ LLAMA_LOG_INFO("... ");
+ for(int c=0; c<3; c++) {
+ float v = ggml_get_f32_nd(t, cols-3+c, rows-3+r, 0, 0);
+ LLAMA_LOG_INFO("%11.8f ", v);
+ }
+ LLAMA_LOG_INFO("\n");
+ }
+}
+
struct llm_build_context {
const llama_model & model;
llama_context & lctx;
@@ -9743,6 +9940,7 @@ struct llm_build_context {
@@ -9743,6 +9906,7 @@ struct llm_build_context {
lctx.inp_pos_bucket = nullptr;
lctx.inp_embd_enc = nullptr;
lctx.inp_KQ_mask_cross = nullptr;
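The show_tensor() helper removed from the patch earlier in this hunk was a temporary debug aid: it prints the four 3x3 corners of a 2D F32 tensor via ggml_get_f32_nd(), implicitly assuming both dimensions are at least 3 and that the data is host-readable. A hedged, self-contained example of the same accessor on a small CPU tensor (not from the patch):

```cpp
#include <cstdio>

#include "ggml.h"

// Small CPU-only demo of the element accessor that show_tensor() relied on.
int main() {
    struct ggml_init_params params = { /*mem_size*/ 16 * 1024 * 1024, /*mem_buffer*/ NULL, /*no_alloc*/ false };
    struct ggml_context * ctx = ggml_init(params);

    // 8 columns (ne[0]) x 4 rows (ne[1]), matching show_tensor's cols/rows view
    struct ggml_tensor * t = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8, 4);
    for (int r = 0; r < 4; r++) {
        for (int c = 0; c < 8; c++) {
            ggml_set_f32_nd(t, c, r, 0, 0, (float) (r * 8 + c));
        }
    }

    printf("t[2,1] = %f\n", ggml_get_f32_nd(t, 2, 1, 0, 0)); // prints 10.000000

    ggml_free(ctx);
    return 0;
}
```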
@@ -453,7 +393,7 @@ index b7771f53..75bbc226 100644
}
void free() {
@@ -10158,6 +10356,253 @@ struct llm_build_context {
@@ -10158,6 +10322,253 @@ struct llm_build_context {
LLM_NORM_RMS, cb, -1);
cb(cur, "result_norm", -1);
@@ -707,7 +647,7 @@ index b7771f53..75bbc226 100644
// lm_head
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
cb(cur, "result_output", -1);
@@ -15493,6 +15938,10 @@ static struct ggml_cgraph * llama_build_graph(
@@ -15493,6 +15904,10 @@ static struct ggml_cgraph * llama_build_graph(
{
result = llm.build_llama();
} break;
@@ -718,7 +658,7 @@ index b7771f53..75bbc226 100644
case LLM_ARCH_BAICHUAN:
{
result = llm.build_baichuan();
@@ -15736,7 +16185,6 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
@@ -15736,7 +16151,6 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
if (batch.token) {
const int64_t n_tokens = batch.n_tokens;
@@ -726,7 +666,7 @@ index b7771f53..75bbc226 100644
ggml_backend_tensor_set(lctx.inp_tokens, batch.token, 0, n_tokens*ggml_element_size(lctx.inp_tokens));
}
@@ -16123,6 +16571,13 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
@@ -16123,6 +16537,15 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
}
}
}
@@ -734,13 +674,15 @@ index b7771f53..75bbc226 100644
+ // TODO (jmorganca): this might copy a lot of data on every request of a
+ // single generation even though it doesn't change, so we should
+ // find a way to not set this more than one time per image
+ if (lctx.cross_attn_state && lctx.inp_cross_attn_state->buffer) {
+ if (lctx.cross_attn_state &&
+ lctx.inp_cross_attn_state &&
+ lctx.inp_cross_attn_state->buffer) {
+ ggml_backend_tensor_set(lctx.inp_cross_attn_state, lctx.cross_attn_state, 0, hparams.n_embd * 1601 * 4 * ggml_element_size(lctx.inp_cross_attn_state));
+ }
}
// Make sure enough space is available for outputs.
@@ -16430,6 +16885,10 @@ static int llama_decode_internal(
@@ -16430,6 +16853,10 @@ static int llama_decode_internal(
llama_set_inputs(lctx, ubatch);
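The change above widens the guard around the cross attention copy: the state is only written when lctx.cross_attn_state, the inp_cross_attn_state tensor, and its backend buffer all exist, and the copy size is n_embd * 1601 * 4 elements. A hedged sketch of that guarded copy in isolation (set_cross_attn_input is illustrative; 1601 and 4 are the patch's hard-coded vision-token and tile counts):

```cpp
#include "ggml.h"
#include "ggml-backend.h"

// Illustrative helper (not from the patch): copy the cross attention state
// into a graph input tensor only when everything the copy needs exists.
static void set_cross_attn_input(struct ggml_tensor * inp, const float * state, int64_t n_embd) {
    if (!state || !inp || !inp->buffer) {
        return; // no state yet, or the input tensor is not backed by a buffer
    }
    // n_embd * 1601 * 4 elements, matching the copy in llama_set_inputs()
    const size_t nbytes = (size_t) (n_embd * 1601 * 4) * ggml_element_size(inp);
    ggml_backend_tensor_set(inp, state, 0, nbytes);
}
```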
@@ -751,7 +693,7 @@ index b7771f53..75bbc226 100644
llama_graph_compute(lctx, gf, n_threads, threadpool);
// update the kv ring buffer
@@ -17586,7 +18045,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
@@ -17586,7 +18013,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
if (llama_model_has_encoder(&model)) {
n_attn_layer *= 3;
}
@@ -762,7 +704,7 @@ index b7771f53..75bbc226 100644
}
size_t total_size_org = 0;
@@ -18681,6 +19142,18 @@ struct llama_context * llama_new_context_with_model(
@@ -18681,6 +19110,18 @@ struct llama_context * llama_new_context_with_model(
return ctx;
}
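The dozen added lines this renumbered hunk accounts for (collapsed in this view) sit next to llama_new_context_with_model() and are where the patch defines its new context-level entry points. A hedged sketch of how a caller might drive them, with the function names and call shapes taken from the commented-out calls in process_images() above (decode_with_image and the surrounding flow are illustrative):

```cpp
#include <vector>

#include "llama.h"

// Hedged sketch (not from the patch): decode image-conditioned tokens with the
// cross attention state installed, then clear it. The (ctx) and (ctx, float *)
// shapes mirror the commented-out calls in process_images(); these functions
// only exist with the mllama patch applied.
static void decode_with_image(llama_context * ctx, std::vector<float> & cross_attn_state) {
    // install the vision encoder output before decoding
    llama_set_cross_attn_state(ctx, cross_attn_state.data());

    // ... llama_decode() calls for the image-conditioned prompt go here ...

    // drop the state once the image has been consumed
    llama_reset_cross_attn_state(ctx);
}
```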
@@ -781,7 +723,7 @@ index b7771f53..75bbc226 100644
void llama_free(struct llama_context * ctx) {
delete ctx;
}
@@ -18731,6 +19204,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
@@ -18731,6 +19172,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
// use what we call a normal RoPE, operating on pairs of consecutive head values
case LLM_ARCH_LLAMA: