From 71e76f8c90c1aa02aeebdaaec532c7d55c7b63b0 Mon Sep 17 00:00:00 2001
From: jmorganca
Date: Thu, 26 Sep 2024 23:53:12 -0700
Subject: [PATCH] server.cpp: cleanup cross attention state

---
 llm/ext_server/server.cpp | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/llm/ext_server/server.cpp b/llm/ext_server/server.cpp
index f1751c60..b0f45fe7 100644
--- a/llm/ext_server/server.cpp
+++ b/llm/ext_server/server.cpp
@@ -729,6 +729,10 @@ struct llama_server_context
             slot->sparams.samplers_sequence = default_sparams.samplers_sequence;
         }
 
+        // Check for mllama architecture, which processes images differently than llava
+        char arch_str[256];
+        llama_model_meta_val_str(model, "general.architecture", arch_str, 256);
+        bool is_mllama = strcmp(arch_str, "mllama") == 0;
         if (multimodal)
         {
             const auto &images_data = data.find("image_data");
@@ -738,11 +742,6 @@ struct llama_server_context
                 {
                     const std::vector<uint8_t> image_buffer = base64_decode(img["data"].get<std::string>());
 
-                    // Check for mllama architecture, which processes images differently than llava
-                    char arch_str[256];
-                    llama_model_meta_val_str(model, "general.architecture", arch_str, 256);
-                    bool is_mllama = strcmp(arch_str, "mllama") == 0;
-
                     if (is_mllama) {
                         LOG_INFO("MLLAMA architecture detected, processing first image", {{"slot_id", slot->id}});
 
@@ -820,6 +819,8 @@ struct llama_server_context
                     slot->params.input_suffix = prompt.substr(begin_prefix);
                     slot->params.cache_prompt = false; // multimodal doesn't support cache prompt
                 }
+            } else {
+                llama_set_cross_attn_state(ctx, nullptr);
             }
         }
 
@@ -1496,6 +1497,7 @@ struct llama_server_context
             {
                 if (slot.task_id == task.target_id)
                 {
+                    slot.reset();
                     slot.release();
                     break;
                 }
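
Note on the hoisted metadata probe: llama_model_meta_val_str returns the
length of the value on success and a negative value when the key is not
found, and is not guaranteed to populate the buffer in that case. A minimal
defensive sketch of the same probe (my own illustration under that
assumption, not part of this patch):

    // Sketch only: zero-initialize the buffer and check the return value
    // so the strcmp below stays well-defined even if the GGUF metadata
    // lacks a general.architecture key.
    char arch_str[256] = {0};
    if (llama_model_meta_val_str(model, "general.architecture", arch_str, sizeof(arch_str)) < 0) {
        arch_str[0] = '\0';  // key absent: treat the model as non-mllama
    }
    bool is_mllama = strcmp(arch_str, "mllama") == 0;

Hoisting the probe out of the per-image loop means it runs once per request
rather than once per image, and its result is in scope for the new else
branch, which clears the cross attention state when no image data is
supplied.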