diff --git a/llm/ext_server/server.cpp b/llm/ext_server/server.cpp
index 7b281977..318b581a 100644
--- a/llm/ext_server/server.cpp
+++ b/llm/ext_server/server.cpp
@@ -1032,18 +1032,6 @@ struct llama_server_context
 
     bool process_images(server_slot &slot) const
     {
-        // Set cross attention state for mllama models
-        // TODO (jmorganca): this should be provided via the API
-        // TODO (jmorganca): generalize this beyond mllama models
-        char arch_str[256];
-        llama_model_meta_val_str(model, "general.architecture", arch_str, 256);
-        if (strcmp(arch_str, "mllama") == 0) {
-            // TODO (jmorganca): this should be passed in via the llama_decode api
-            // or similar, maybe using the llama_batch struct
-            // llama_reset_cross_attn_state(ctx);
-            // llama_set_cross_attn_state(ctx, (float*)cross_attn_state);
-        }
-
         for (slot_image &img : slot.images)
         {
             if (!img.request_encode_image)
@@ -1258,6 +1246,17 @@ struct llama_server_context
         task.type = TASK_TYPE_COMPLETION;
         task.multitask_id = multitask_id;
 
+        // Set cross attention state for mllama models
+        // TODO (jmorganca): this should be provided via the API
+        // TODO (jmorganca): generalize this beyond mllama models
+        char arch_str[256];
+        llama_model_meta_val_str(model, "general.architecture", arch_str, 256);
+        if (strcmp(arch_str, "mllama") == 0) {
+            // TODO (jmorganca): this should be passed in via the llama_decode api
+            // or similar, maybe using the llama_batch struct
+            // llama_set_cross_attn_state(ctx, (float*)cross_attn_state);
+        }
+
         // when a completion task's prompt array is not a singleton, we split it into multiple requests
         // otherwise, it's a single-prompt task, we actually queue it
         // if there's numbers in the prompt array it will be treated as an array of tokens
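
The diff relocates the mllama architecture check from `process_images()` (per-image handling) to `request_completion()` (task creation), dropping the commented-out `llama_reset_cross_attn_state(ctx)` call along the way. The sketch below isolates the metadata lookup that both versions rely on; it is a minimal sketch, not the server's code — `is_mllama` is a hypothetical helper name, and it assumes only a loaded `llama_model *` and the `llama_model_meta_val_str()` accessor from `llama.h`.

```cpp
#include <cstring>
#include "llama.h"

// Sketch: detect an mllama model by reading the "general.architecture" GGUF key,
// mirroring the check the diff moves into request_completion().
static bool is_mllama(const llama_model * model) {
    char arch_str[256];
    // llama_model_meta_val_str copies the value for the given metadata key into
    // the buffer and returns its length, or a negative value if the key is absent.
    if (llama_model_meta_val_str(model, "general.architecture", arch_str, sizeof(arch_str)) < 0) {
        return false;
    }
    return std::strcmp(arch_str, "mllama") == 0;
}
```

Keeping the check at task-creation time means the cross-attention state (once it is actually plumbed through, per the TODOs) would be set once per completion request rather than inside the image-encoding loop.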