Add support for Janus vision encoder and projector [WIP] #11646

Open · wants to merge 6 commits into base: master
118 changes: 118 additions & 0 deletions examples/llava/clip-cli.cpp
@@ -0,0 +1,118 @@
//
// Example usage of just the vision encoder (CLIP) part of the LLAVA codebase.
// It loads a CLIP model (gguf file) and an image file,
// computes the image embedding, and prints out (a few elements of) the embedding.
//
// Build and run (for example):
// ./bin/llama-clip-cli -c model.gguf -i input.png --threads 1 --verbosity 1
// ./bin/llama-clip-cli -c clip.gguf -i input.png --threads 1 --verbosity 1

#include "arg.h"
#include "base64.hpp"
#include "log.h"
#include "common.h"
#include "clip.h"
#include "llava.h"
#include "ggml.h"

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <string>
#include <vector>
#include <algorithm>

// Structure to hold our command line parameters.
struct vision_params {
    std::string clip_model; // Path to the CLIP model file (gguf)
    std::string image_file; // Path to the image file to process
    int n_threads = 1;      // Number of CPU threads to use
    int verbosity = 1;      // Verbosity level for model loading
};

static void print_usage(const char * progname) {
    LOG("\nUsage: %s -c <clip_model_path> -i <image_file> [--threads <n_threads>] [--verbosity <level>]\n\n", progname);
}

int main(int argc, char ** argv) {
    ggml_time_init();

    vision_params params;

    // Simple command line parsing
    if (argc < 5) {
        print_usage(argv[0]);
        return 1;
    }
    for (int i = 1; i < argc; i++) {
        std::string arg = argv[i];
        if (arg == "-c" || arg == "--clip") {
            if (i + 1 < argc) {
                params.clip_model = argv[++i];
            } else {
                print_usage(argv[0]);
                return 1;
            }
        } else if (arg == "-i" || arg == "--image") {
            if (i + 1 < argc) {
                params.image_file = argv[++i];
            } else {
                print_usage(argv[0]);
                return 1;
            }
        } else if (arg == "--threads") {
            if (i + 1 < argc) {
                params.n_threads = std::atoi(argv[++i]);
            } else {
                print_usage(argv[0]);
                return 1;
            }
        } else if (arg == "--verbosity") {
            if (i + 1 < argc) {
                params.verbosity = std::atoi(argv[++i]);
            } else {
                print_usage(argv[0]);
                return 1;
            }
        } else {
            // Unknown argument.
            print_usage(argv[0]);
            return 1;
        }
    }

    if (params.clip_model.empty() || params.image_file.empty()) {
        print_usage(argv[0]);
        return 1;
    }

    // Load the CLIP model.
    struct clip_ctx * ctx_clip = clip_model_load(params.clip_model.c_str(), params.verbosity);
    if (!ctx_clip) {
        LOG_ERR("Failed to load clip model from %s\n", params.clip_model.c_str());
        return 1;
    }
    LOG_INF("Clip model loaded from %s\n", params.clip_model.c_str());

    // Load and process the image.
    llava_image_embed * embed = llava_image_embed_make_with_filename(ctx_clip, params.n_threads, params.image_file.c_str());
    if (!embed) {
        LOG_ERR("Failed to load or process image from %s\n", params.image_file.c_str());
        clip_free(ctx_clip);
        return 1;
    }
    LOG_INF("Image loaded and processed from %s\n", params.image_file.c_str());
    LOG_INF("Image embedding computed with %d positions.\n", embed->n_image_pos);

    int print_count = (embed->n_image_pos < 10 ? embed->n_image_pos : 10);
    LOG_INF("First %d elements: ", print_count);
    for (int i = 0; i < print_count; i++) {
        LOG_INF("%f ", embed->embed[i]);
    }
    LOG_INF("\n");

    llava_image_embed_free(embed);
    clip_free(ctx_clip);

    return 0;
}
129 changes: 129 additions & 0 deletions examples/llava/clip.cpp
@@ -103,6 +103,7 @@ static std::string format(const char * fmt, ...) {
#define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector"
#define KEY_HAS_MINICPMV_PROJ "clip.has_minicpmv_projector"
#define KEY_HAS_GLM_PROJ "clip.has_glm_projector"
#define KEY_HAS_JANUS_ATTN_POOL "clip.has_janus_attn_pool"
#define KEY_MINICPMV_VERSION "clip.minicpmv_version"
#define KEY_HAS_QWEN2VL_MERGER "clip.has_qwen2vl_merger"
#define KEY_USE_GELU "clip.use_gelu"
@@ -170,6 +171,15 @@ static std::string format(const char * fmt, ...) {
#define TN_GLM_BOI_W "adapter.boi"
#define TN_GLM_EOI_W "adapter.eoi"

#define TN_JANUS_ATTN_POOL_LATENT "v.attn_pool.latent"
#define TN_JANUS_ATTN_POOL_Q "v.attn_pool.q.%s"
#define TN_JANUS_ATTN_POOL_K "v.attn_pool.k.%s"
#define TN_JANUS_ATTN_POOL_V "v.attn_pool.v.%s"
#define TN_JANUS_ATTN_POOL_PROJ "v.attn_pool.proj.%s"
#define TN_JANUS_ATTN_POOL_FFN_DOWN "v.attn_pool.ffn_down.%s"
#define TN_JANUS_ATTN_POOL_NORM "v.attn_pool.norm.%s"
#define TN_JANUS_ATTN_POOL_FFN_UP "v.attn_pool.ffn_up.%s"
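// e.g. format(TN_JANUS_ATTN_POOL_Q, "weight") resolves to the gguf tensor name "v.attn_pool.q.weight"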


enum projector_type {
    PROJECTOR_TYPE_MLP,
@@ -179,6 +189,7 @@ enum projector_type {
    PROJECTOR_TYPE_RESAMPLER,
    PROJECTOR_TYPE_GLM_EDGE,
    PROJECTOR_TYPE_MERGER,
    PROJECTOR_TYPE_ATTN_POOL,
    PROJECTOR_TYPE_UNKNOWN,
};

@@ -189,6 +200,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
    { PROJECTOR_TYPE_RESAMPLER, "resampler"},
    { PROJECTOR_TYPE_GLM_EDGE,  "adapter"},
    { PROJECTOR_TYPE_MERGER,    "qwen2vl_merger"},
    { PROJECTOR_TYPE_ATTN_POOL, "janus_attn_pool"},
};


@@ -571,6 +583,23 @@ struct clip_vision_model {
    struct ggml_tensor * mm_model_ln_kv_b;
    struct ggml_tensor * mm_model_ln_post_w;
    struct ggml_tensor * mm_model_ln_post_b;

    // Janus attention pool with latent query
    struct ggml_tensor * attn_pool_latent;
    struct ggml_tensor * attn_pool_q_w;
    struct ggml_tensor * attn_pool_q_b;
    struct ggml_tensor * attn_pool_k_w;
    struct ggml_tensor * attn_pool_k_b;
    struct ggml_tensor * attn_pool_v_w;
    struct ggml_tensor * attn_pool_v_b;
    struct ggml_tensor * attn_pool_proj_w;
    struct ggml_tensor * attn_pool_proj_b;
    struct ggml_tensor * attn_pool_norm_w;
    struct ggml_tensor * attn_pool_norm_b;
    struct ggml_tensor * attn_pool_ffn_up_w;
    struct ggml_tensor * attn_pool_ffn_up_b;
    struct ggml_tensor * attn_pool_ffn_down_w;
    struct ggml_tensor * attn_pool_ffn_down_b;
};

struct clip_ctx {
@@ -580,6 +609,7 @@ struct clip_ctx {
    bool has_minicpmv_projector = false;
    bool has_glm_projector = false;
    bool has_qwen2vl_merger = false;
    bool has_janus_attn_pool = false;
    int minicpmv_version = 2;

    struct clip_vision_model vision_model;
@@ -1153,6 +1183,78 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
        embeddings = ggml_add(ctx0, embeddings, model.mm_1_b);
    }

    // janus attn pool with latent query
    // TODO: Check the ctx0 and memory usage
    else if (ctx->has_janus_attn_pool) {
        if (ctx->proj_type == PROJECTOR_TYPE_ATTN_POOL) {
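            // Attention pooling with a single learned latent query (cf. the timm links below):
            //   Q = Wq * latent,  K = Wk * x,  V = Wv * x
            //   pooled = proj(softmax(Q K^T / sqrt(d_head)) V)
            // followed by a LayerNorm, an MLP and a residual connection.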
            struct ggml_tensor * latent = model.attn_pool_latent;
            struct ggml_tensor * latent_expanded = ggml_repeat(ctx0, latent,
                ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, 1, batch_size));

            struct ggml_tensor * Q = ggml_add(ctx0,
                ggml_mul_mat(ctx0, model.attn_pool_q_w, latent_expanded),
                model.attn_pool_q_b);
            Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, 1, batch_size);
            Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
            Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
            Q = ggml_reshape_3d(ctx0, Q, d_head, 1, n_head * batch_size);
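            // Q is now [d_head, 1, n_head * batch_size]: a single query vector per head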

            struct ggml_tensor * K = ggml_add(ctx0,
                ggml_mul_mat(ctx0, model.attn_pool_k_w, embeddings),
                model.attn_pool_k_b);
            K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size);
            K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
            K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size);

            struct ggml_tensor * V = ggml_add(ctx0,
                ggml_mul_mat(ctx0, model.attn_pool_v_w, embeddings),
                model.attn_pool_v_b);
            V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size);
            V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
            V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size);
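            // V is laid out as [num_positions, d_head, n_head * batch_size] so that V * KQ below
            // directly yields the attention-weighted sum over positions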

            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
            KQ = ggml_soft_max_inplace(ctx0, KQ);
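            // KQ: attention of the latent query over all image positions, [num_positions, 1, n_head * batch_size]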

            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
            KQV = ggml_reshape_4d(ctx0, KQV, d_head, 1, n_head, batch_size);
            KQV = ggml_cont(ctx0, ggml_permute(ctx0, KQV, 0, 2, 1, 3));

            KQV = ggml_cont_3d(ctx0, KQV, hidden_size, 1, batch_size);
            KQV = ggml_add(ctx0,
                ggml_mul_mat(ctx0, model.attn_pool_proj_w, KQV),
                model.attn_pool_proj_b);
            // Norm before MLP
            KQV = ggml_norm(ctx0, KQV, eps);
            KQV = ggml_add(ctx0, ggml_mul(ctx0, KQV, model.attn_pool_norm_w), model.attn_pool_norm_b);

            // MLP: fc1 -> gelu -> fc2
            // References:
            // /~https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/mlp.py#L13
            // Assign to the outer `embeddings` (no new declaration) so the pooled result is
            // what gets passed to ggml_build_forward_expand below.
            embeddings = ggml_add(ctx0, ggml_mul_mat(ctx0, model.attn_pool_ffn_down_w, KQV), model.attn_pool_ffn_down_b);
            embeddings = ggml_gelu_inplace(ctx0, embeddings);
            embeddings = ggml_add(ctx0, ggml_mul_mat(ctx0, model.attn_pool_ffn_up_w, embeddings), model.attn_pool_ffn_up_b);
            // Residual connection
            // /~https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/attention_pool.py#L98
            embeddings = ggml_add(ctx0, embeddings, KQV);

            // Pooling: select the first (and only) latent token
            embeddings = ggml_view_2d(ctx0,
                embeddings,
                embeddings->ne[0],
                embeddings->ne[2],
                embeddings->nb[2],
                0);
        } else {
            GGML_ABORT("fatal error");
        }
    }

    // build the graph
    ggml_build_forward_expand(gf, embeddings);

@@ -1337,6 +1439,10 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
    if (idx != -1) {
        new_clip->has_qwen2vl_merger = gguf_get_val_bool(ctx, idx);
    }
    idx = gguf_find_key(ctx, KEY_HAS_JANUS_ATTN_POOL);
    if (idx != -1) {
        new_clip->has_janus_attn_pool = gguf_get_val_bool(ctx, idx);
    }
    // GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search

    GGML_ASSERT(new_clip->has_vision_encoder);
@@ -1358,6 +1464,8 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
LOG_INF("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector);
LOG_INF("%s: minicpmv_projector: %d\n", __func__, new_clip->has_minicpmv_projector);
LOG_INF("%s: glm_projector: %d\n", __func__, new_clip->has_glm_projector);
LOG_INF("%s: qwen2vl_merger: %d\n", __func__, new_clip->has_qwen2vl_merger);
LOG_INF("%s: janus_attn_pool: %d\n", __func__, new_clip->has_janus_attn_pool);
LOG_INF("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
LOG_INF("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
}
@@ -1643,6 +1751,24 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        vision_model.mm_1_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
        vision_model.mm_1_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
    }
    else if (new_clip->proj_type == PROJECTOR_TYPE_ATTN_POOL) {
        vision_model.attn_pool_latent = get_tensor(new_clip->ctx_data, TN_JANUS_ATTN_POOL_LATENT);
        vision_model.attn_pool_q_w = get_tensor(new_clip->ctx_data, format(TN_JANUS_ATTN_POOL_Q, "weight"));
        vision_model.attn_pool_q_b = get_tensor(new_clip->ctx_data, format(TN_JANUS_ATTN_POOL_Q, "bias"));
        vision_model.attn_pool_k_w = get_tensor(new_clip->ctx_data, format(TN_JANUS_ATTN_POOL_K, "weight"));
        vision_model.attn_pool_k_b = get_tensor(new_clip->ctx_data, format(TN_JANUS_ATTN_POOL_K, "bias"));
        vision_model.attn_pool_v_w = get_tensor(new_clip->ctx_data, format(TN_JANUS_ATTN_POOL_V, "weight"));
        vision_model.attn_pool_v_b = get_tensor(new_clip->ctx_data, format(TN_JANUS_ATTN_POOL_V, "bias"));
        vision_model.attn_pool_proj_w = get_tensor(new_clip->ctx_data, format(TN_JANUS_ATTN_POOL_PROJ, "weight"));
        vision_model.attn_pool_proj_b = get_tensor(new_clip->ctx_data, format(TN_JANUS_ATTN_POOL_PROJ, "bias"));
        vision_model.attn_pool_ffn_down_w = get_tensor(new_clip->ctx_data, format(TN_JANUS_ATTN_POOL_FFN_DOWN, "weight"));
        vision_model.attn_pool_ffn_down_b = get_tensor(new_clip->ctx_data, format(TN_JANUS_ATTN_POOL_FFN_DOWN, "bias"));
        vision_model.attn_pool_norm_w = get_tensor(new_clip->ctx_data, format(TN_JANUS_ATTN_POOL_NORM, "weight"));
        vision_model.attn_pool_norm_b = get_tensor(new_clip->ctx_data, format(TN_JANUS_ATTN_POOL_NORM, "bias"));
        vision_model.attn_pool_ffn_up_w = get_tensor(new_clip->ctx_data, format(TN_JANUS_ATTN_POOL_FFN_UP, "weight"));
        vision_model.attn_pool_ffn_up_b = get_tensor(new_clip->ctx_data, format(TN_JANUS_ATTN_POOL_FFN_UP, "bias"));
    }
    else {
        std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type];
        throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
@@ -2908,6 +3034,9 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
    if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
        return ctx->vision_model.mm_1_b->ne[0];
    }
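    // Janus attention pool: the projected embedding size is the length of the final MLP (fc2) bias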
    if (ctx->proj_type == PROJECTOR_TYPE_ATTN_POOL) {
        return ctx->vision_model.attn_pool_ffn_up_b->ne[0];
    }

    std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
    throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));