update server.cpp changes

jmorganca 2024-09-25 21:54:23 -07:00
parent d0c8ce5ea4
commit 055cb6b0e2


@@ -1032,18 +1032,6 @@ struct llama_server_context
 
     bool process_images(server_slot &slot) const
     {
-        // Set cross attention state for mllama models
-        // TODO (jmorganca): this should be provided via the API
-        // TODO (jmorganca): generalize this beyond mllama models
-        char arch_str[256];
-        llama_model_meta_val_str(model, "general.architecture", arch_str, 256);
-        if (strcmp(arch_str, "mllama") == 0) {
-            // TODO (jmorganca): this should be passed in via the llama_decode api
-            // or similar, maybe using the llama_batch struct
-            // llama_reset_cross_attn_state(ctx);
-            // llama_set_cross_attn_state(ctx, (float*)cross_attn_state);
-        }
-
         for (slot_image &img : slot.images)
         {
             if (!img.request_encode_image)
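As background for the block being moved: the check reads the general.architecture key from the model's GGUF metadata and compares it against the "mllama" architecture string. A minimal sketch of that check as a standalone helper, assuming a loaded llama_model pointer and the llama.cpp C API (model_is_mllama is a hypothetical name, not part of this commit):

#include <cstring>
#include "llama.h"

// Returns true when the loaded model declares the "mllama" architecture
// in its GGUF metadata. llama_model_meta_val_str writes the value into
// the buffer and returns its length, or a negative value if the key is
// absent.
static bool model_is_mllama(const llama_model * model) {
    char arch_str[256];
    int32_t n = llama_model_meta_val_str(model, "general.architecture", arch_str, sizeof(arch_str));
    return n >= 0 && std::strcmp(arch_str, "mllama") == 0;
}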
@@ -1258,6 +1246,17 @@ struct llama_server_context
         task.type = TASK_TYPE_COMPLETION;
         task.multitask_id = multitask_id;
 
+        // Set cross attention state for mllama models
+        // TODO (jmorganca): this should be provided via the API
+        // TODO (jmorganca): generalize this beyond mllama models
+        char arch_str[256];
+        llama_model_meta_val_str(model, "general.architecture", arch_str, 256);
+        if (strcmp(arch_str, "mllama") == 0) {
+            // TODO (jmorganca): this should be passed in via the llama_decode api
+            // or similar, maybe using the llama_batch struct
+            // llama_set_cross_attn_state(ctx, (float*)cross_attn_state);
+        }
+
         // when a completion task's prompt array is not a singleton, we split it into multiple requests
         // otherwise, it's a single-prompt task, we actually queue it
         // if there's numbers in the prompt array it will be treated as an array of tokens
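The TODOs above describe a direction rather than a final design: the cross-attention state would travel with each request (for example through llama_decode or the llama_batch struct) instead of being set on the context from model metadata. A hypothetical sketch of that shape, reusing the llama_set_cross_attn_state name from the commented-out call; the struct and helper below are illustrative only and exist neither in upstream llama.cpp nor in this fork:

// Hypothetical per-request payload: the architecture check cached once
// at load time, plus the vision encoder output for this request.
struct cross_attn_request {
    bool    is_mllama        = false;
    float * cross_attn_state = nullptr;
};

// Applies the state immediately before decoding a request's batch,
// mirroring the commented-out call in the diff above. Per the TODO,
// this would ideally be carried by llama_decode / llama_batch rather
// than mutating context-level state.
static void apply_cross_attn_state(llama_context * ctx, const cross_attn_request & req) {
    if (req.is_mllama && req.cross_attn_state != nullptr) {
        llama_set_cross_attn_state(ctx, req.cross_attn_state);
    }
}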