From 71e76f8c90c1aa02aeebdaaec532c7d55c7b63b0 Mon Sep 17 00:00:00 2001
From: jmorganca
Date: Thu, 26 Sep 2024 23:53:12 -0700
Subject: [PATCH] server.cpp: cleanup cross attention state

---
 llm/ext_server/server.cpp | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/llm/ext_server/server.cpp b/llm/ext_server/server.cpp
index f1751c60..b0f45fe7 100644
--- a/llm/ext_server/server.cpp
+++ b/llm/ext_server/server.cpp
@@ -729,6 +729,10 @@ struct llama_server_context
             slot->sparams.samplers_sequence = default_sparams.samplers_sequence;
         }
 
+        // Check for mllama architecture, which processes images differently than llava
+        char arch_str[256];
+        llama_model_meta_val_str(model, "general.architecture", arch_str, 256);
+        bool is_mllama = strcmp(arch_str, "mllama") == 0;
         if (multimodal)
         {
             const auto &images_data = data.find("image_data");
@@ -738,11 +742,6 @@ struct llama_server_context
                 {
                     const std::vector<uint8_t> image_buffer = base64_decode(img["data"].get<std::string>());
 
-                    // Check for mllama architecture, which processes images differently than llava
-                    char arch_str[256];
-                    llama_model_meta_val_str(model, "general.architecture", arch_str, 256);
-                    bool is_mllama = strcmp(arch_str, "mllama") == 0;
-
                     if (is_mllama) {
                         LOG_INFO("MLLAMA architecture detected, processing first image", {{"slot_id", slot->id}});
 
@@ -820,6 +819,8 @@ struct llama_server_context
                     slot->params.input_suffix = prompt.substr(begin_prefix);
                     slot->params.cache_prompt = false; // multimodal doesn't support cache prompt
                 }
+            } else {
+                llama_set_cross_attn_state(ctx, nullptr);
             }
         }
 
@@ -1496,6 +1497,7 @@ struct llama_server_context
             {
                 if (slot.task_id == task.target_id)
                 {
+                    slot.reset();
                     slot.release();
                     break;
                 }
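
Note on the hoisted metadata probe: llama_model_meta_val_str returns the
length of the value on success and a negative value when the key is not
found, and is not guaranteed to populate the buffer in that case. A minimal
defensive sketch of the same probe (my own illustration under that
assumption, not part of this patch):

    // Sketch only: zero-initialize the buffer and check the return value
    // so the strcmp below stays well-defined even if the GGUF metadata
    // lacks a general.architecture key.
    char arch_str[256] = {0};
    if (llama_model_meta_val_str(model, "general.architecture", arch_str, sizeof(arch_str)) < 0) {
        arch_str[0] = '\0';  // key absent: treat the model as non-mllama
    }
    bool is_mllama = strcmp(arch_str, "mllama") == 0;

Hoisting the probe out of the per-image loop means it runs once per request
rather than once per image, and its result is in scope for the new else
branch, which clears the cross attention state when no image data is
supplied.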