From 2f89058f24f816b62a06ef3807db9c613d106cc4 Mon Sep 17 00:00:00 2001 From: leejet Date: Mon, 11 May 2026 01:01:13 +0800 Subject: [PATCH 1/2] feat: add hidream o1 image support --- src/conditioner.hpp | 24 +- src/denoiser.hpp | 4 +- src/diffusion_model.hpp | 84 +++ src/ggml_extend.hpp | 95 ++- src/hidream_o1.hpp | 975 +++++++++++++++++++++++++++++ src/llm.hpp | 117 ++-- src/model.cpp | 4 + src/model.h | 2 + src/stable-diffusion.cpp | 47 +- src/tokenizers/qwen2_tokenizer.cpp | 5 + src/vae.hpp | 2 +- 11 files changed, 1293 insertions(+), 66 deletions(-) create mode 100644 src/hidream_o1.hpp diff --git a/src/conditioner.hpp b/src/conditioner.hpp index 4907938b0..cc51718ad 100644 --- a/src/conditioner.hpp +++ b/src/conditioner.hpp @@ -14,6 +14,13 @@ struct SDCondition { sd::Tensor c_concat; sd::Tensor c_t5_ids; sd::Tensor c_t5_weights; + sd::Tensor c_input_ids; + sd::Tensor c_position_ids; + sd::Tensor c_token_types; + sd::Tensor c_image_embed_ranges; + sd::Tensor c_vinput_mask; + std::vector> c_vlm_images; + std::vector> c_ref_images; std::vector> extra_c_crossattns; @@ -26,10 +33,25 @@ struct SDCondition { bool empty() const { if (!c_crossattn.empty() || !c_vector.empty() || !c_concat.empty() || - !c_t5_ids.empty() || !c_t5_weights.empty()) { + !c_t5_ids.empty() || !c_t5_weights.empty() || + !c_input_ids.empty() || !c_position_ids.empty() || + !c_token_types.empty() || !c_image_embed_ranges.empty() || + !c_vinput_mask.empty()) { return false; } + for (const auto& tensor : c_vlm_images) { + if (!tensor.empty()) { + return false; + } + } + + for (const auto& tensor : c_ref_images) { + if (!tensor.empty()) { + return false; + } + } + for (const auto& tensor : extra_c_crossattns) { if (!tensor.empty()) { return false; diff --git a/src/denoiser.hpp b/src/denoiser.hpp index 831da2580..3f08706da 100644 --- a/src/denoiser.hpp +++ b/src/denoiser.hpp @@ -1289,8 +1289,8 @@ static sd::Tensor sample_res_multistep(denoise_cb_t model, } sd::Tensor denoised = 
std::move(denoised_opt); - float sigma_from = sigmas[i]; - float sigma_to = sigmas[i + 1]; + float sigma_from = sigmas[i]; + float sigma_to = sigmas[i + 1]; auto [sigma_down, sigma_up, alpha_scale] = get_ancestral_step(sigma_from, sigma_to, eta, is_flow_denoiser); diff --git a/src/diffusion_model.hpp b/src/diffusion_model.hpp index 1a202a1a7..d0204baf6 100644 --- a/src/diffusion_model.hpp +++ b/src/diffusion_model.hpp @@ -5,6 +5,7 @@ #include "anima.hpp" #include "ernie_image.hpp" #include "flux.hpp" +#include "hidream_o1.hpp" #include "mmdit.hpp" #include "qwen_image.hpp" #include "tensor_ggml.hpp" @@ -22,6 +23,12 @@ struct DiffusionParams { const sd::Tensor* t5_weights = nullptr; const sd::Tensor* guidance = nullptr; const std::vector>* ref_latents = nullptr; + const sd::Tensor* input_ids = nullptr; + const sd::Tensor* input_pos = nullptr; + const sd::Tensor* token_types = nullptr; + const sd::Tensor* image_embed_ranges = nullptr; + const sd::Tensor* vinput_mask = nullptr; + const std::vector>* vlm_images = nullptr; bool increase_ref_index = false; int num_video_frames = -1; const std::vector>* controls = nullptr; @@ -476,6 +483,83 @@ struct QwenImageModel : public DiffusionModel { } }; +struct HiDreamO1Model : public DiffusionModel { + std::string prefix; + HiDreamO1::HiDreamO1Runner hidream_o1; + + HiDreamO1Model(ggml_backend_t backend, + bool offload_params_to_cpu, + const String2TensorStorage& tensor_storage_map = {}, + const std::string& prefix = "model") + : prefix(prefix), hidream_o1(backend, offload_params_to_cpu, tensor_storage_map, prefix) { + } + + std::string get_desc() override { + return hidream_o1.get_desc(); + } + + void alloc_params_buffer() override { + hidream_o1.alloc_params_buffer(); + } + + void free_params_buffer() override { + hidream_o1.free_params_buffer(); + } + + void free_compute_buffer() override { + hidream_o1.free_compute_buffer(); + } + + void get_param_tensors(std::map& tensors) override { + hidream_o1.get_param_tensors(tensors, 
prefix); + } + + size_t get_params_buffer_size() override { + return hidream_o1.get_params_buffer_size(); + } + + void set_weight_adapter(const std::shared_ptr& adapter) override { + hidream_o1.set_weight_adapter(adapter); + } + + int64_t get_adm_in_channels() override { + return 0; + } + + void set_flash_attention_enabled(bool enabled) { + hidream_o1.set_flash_attention_enabled(enabled); + } + + void set_max_graph_vram_bytes(size_t max_vram_bytes) override { + hidream_o1.set_max_graph_vram_bytes(max_vram_bytes); + } + + void set_circular_axes(bool circular_x, bool circular_y) override { + hidream_o1.set_circular_axes(circular_x, circular_y); + } + + sd::Tensor compute(int n_threads, + const DiffusionParams& diffusion_params) override { + GGML_ASSERT(diffusion_params.x != nullptr); + GGML_ASSERT(diffusion_params.timesteps != nullptr); + GGML_ASSERT(diffusion_params.input_ids != nullptr); + GGML_ASSERT(diffusion_params.input_pos != nullptr); + GGML_ASSERT(diffusion_params.token_types != nullptr); + static const sd::Tensor empty_image_embed_ranges; + static const std::vector> empty_images; + return hidream_o1.compute(n_threads, + *diffusion_params.x, + *diffusion_params.timesteps, + *diffusion_params.input_ids, + *diffusion_params.input_pos, + *diffusion_params.token_types, + diffusion_params.image_embed_ranges ? *diffusion_params.image_embed_ranges : empty_image_embed_ranges, + diffusion_params.vinput_mask ? *diffusion_params.vinput_mask : empty_image_embed_ranges, + diffusion_params.vlm_images ? *diffusion_params.vlm_images : empty_images, + diffusion_params.ref_latents ? 
*diffusion_params.ref_latents : empty_images); + } +}; + struct ZImageModel : public DiffusionModel { std::string prefix; ZImage::ZImageRunner z_image; diff --git a/src/ggml_extend.hpp b/src/ggml_extend.hpp index 362303229..c1939bcc8 100644 --- a/src/ggml_extend.hpp +++ b/src/ggml_extend.hpp @@ -1698,13 +1698,41 @@ struct WeightAdapter { }; struct GGMLRunnerContext { - ggml_backend_t backend = nullptr; - ggml_context* ggml_ctx = nullptr; - bool flash_attn_enabled = false; - bool conv2d_direct_enabled = false; - bool circular_x_enabled = false; - bool circular_y_enabled = false; - std::shared_ptr weight_adapter = nullptr; + ggml_backend_t backend = nullptr; + ggml_context* ggml_ctx = nullptr; + bool flash_attn_enabled = false; + bool conv2d_direct_enabled = false; + bool circular_x_enabled = false; + bool circular_y_enabled = false; + std::shared_ptr weight_adapter = nullptr; + std::vector>* debug_tensors = nullptr; + std::function get_cache_tensor; + std::function cache_tensor; + + void capture_tensor(const std::string& name, ggml_tensor* tensor) { + if (debug_tensors == nullptr || tensor == nullptr) { + return; + } + ggml_tensor* snapshot = ggml_cont(ggml_ctx, tensor); + ggml_tensor* dst = ggml_dup_tensor(ggml_ctx, snapshot); + snapshot = ggml_cpy(ggml_ctx, snapshot, dst); + ggml_set_output(snapshot); + debug_tensors->push_back({snapshot, name}); + } + + ggml_tensor* load_cache_tensor(const std::string& name) const { + if (!get_cache_tensor) { + return nullptr; + } + return get_cache_tensor(name); + } + + void persist_cache_tensor(const std::string& name, ggml_tensor* tensor) const { + if (!cache_tensor || tensor == nullptr) { + return; + } + cache_tensor(name, tensor); + } }; struct GGMLRunner { @@ -1743,6 +1771,7 @@ struct GGMLRunner { std::map backend_tensor_data_map; std::map cache_tensor_map; // name -> tensor + std::vector> debug_tensors; const std::string final_result_name = "ggml_runner_final_result_tensor"; bool flash_attn_enabled = false; @@ -1838,6 
+1867,7 @@ struct GGMLRunner { } void free_compute_ctx() { + debug_tensors.clear(); if (compute_ctx != nullptr) { ggml_free(compute_ctx); compute_ctx = nullptr; @@ -1884,6 +1914,16 @@ struct GGMLRunner { auto result = ggml_graph_node(gf, -1); ggml_set_name(result, final_result_name.c_str()); } + for (const auto& entry : debug_tensors) { + if (entry.first != nullptr) { + ggml_build_forward_expand(gf, entry.first); + } + } + for (const auto& entry : cache_tensor_map) { + if (entry.second != nullptr) { + ggml_build_forward_expand(gf, entry.second); + } + } prepare_build_in_tensor_after(gf); return gf; } @@ -2031,6 +2071,21 @@ struct GGMLRunner { for (auto& kv : backend_tensor_data_map) { auto tensor = kv.first; auto data = kv.second; + if (tensor == nullptr || data == nullptr) { + continue; + } + const char* name = ggml_get_name(tensor); + if (tensor->buffer == nullptr) { + LOG_WARN("%s skip backend tensor copy: tensor buffer not set, name='%s', ne=[%lld,%lld,%lld,%lld], type=%s", + get_desc().c_str(), + name != nullptr ? 
name : "", + (long long)tensor->ne[0], + (long long)tensor->ne[1], + (long long)tensor->ne[2], + (long long)tensor->ne[3], + ggml_type_name(tensor->type)); + continue; + } if (graph_tensor_set.find(tensor) == graph_tensor_set.end()) { continue; @@ -2421,6 +2476,22 @@ struct GGMLRunner { return std::nullopt; } + for (const auto& entry : debug_tensors) { + auto tensor = entry.first; + if (tensor == nullptr) { + continue; + } + if (tensor->type != GGML_TYPE_F32) { + LOG_WARN("%s skip debug tensor '%s': only GGML_TYPE_F32 is supported, got %s", + get_desc().c_str(), + entry.second.c_str(), + ggml_type_name(tensor->type)); + continue; + } + auto debug_tensor = sd::make_sd_tensor_from_ggml(tensor); + print_sd_tensor(debug_tensor, false, entry.second.c_str()); + } + int64_t t_cache_begin = ggml_time_ms(); if (!copy_cache_tensors_to_cache_buffer(cache_keep_names)) { if (free_compute_buffer_immediately) { @@ -2557,6 +2628,13 @@ struct GGMLRunner { runner_ctx.circular_x_enabled = circular_x_enabled; runner_ctx.circular_y_enabled = circular_y_enabled; runner_ctx.weight_adapter = weight_adapter; + runner_ctx.debug_tensors = &debug_tensors; + runner_ctx.get_cache_tensor = [this](const std::string& name) { + return this->get_cache_tensor_by_name(name); + }; + runner_ctx.cache_tensor = [this](const std::string& name, ggml_tensor* tensor) { + this->cache(name, tensor); + }; return runner_ctx; } @@ -2659,6 +2737,9 @@ struct GGMLRunner { } void cache(const std::string name, ggml_tensor* tensor) { + if (tensor != nullptr && tensor->view_src != nullptr) { + tensor = ggml_cont(compute_ctx, tensor); + } cache_tensor_map[name] = tensor; } diff --git a/src/hidream_o1.hpp b/src/hidream_o1.hpp new file mode 100644 index 000000000..76ad629fd --- /dev/null +++ b/src/hidream_o1.hpp @@ -0,0 +1,975 @@ +#ifndef __SD_HIDREAM_O1_H__ +#define __SD_HIDREAM_O1_H__ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common_dit.hpp" +#include "conditioner.hpp" 
+#include "llm.hpp" +#include "util.h" + +namespace HiDreamO1 { + constexpr int HIDREAM_O1_GRAPH_SIZE = 32768; + constexpr int PATCH_SIZE = 32; + constexpr int TIMESTEP_TOKEN_NUM = 1; + constexpr int IMAGE_TOKEN_ID = 151655; + constexpr int VISION_START_TOKEN_ID = 151652; + + static inline std::string repeat_special_token(const std::string& token, int64_t count) { + std::string out; + out.reserve(static_cast(count) * token.size()); + for (int64_t i = 0; i < count; ++i) { + out += token; + } + return out; + } + + static inline std::pair calculate_dimensions(int max_size, double ratio) { + int width = static_cast(std::sqrt(max_size * max_size * ratio)); + int height = static_cast(width / ratio); + width = (width / PATCH_SIZE) * PATCH_SIZE; + height = (height / PATCH_SIZE) * PATCH_SIZE; + width = std::max(width, PATCH_SIZE); + height = std::max(height, PATCH_SIZE); + return {width, height}; + } + + static inline sd::Tensor resize_to_area(const sd::Tensor& image, int image_size) { + int64_t width = image.shape()[0]; + int64_t height = image.shape()[1]; + int64_t s_max = static_cast(image_size) * image_size; + double scale = std::sqrt(static_cast(s_max) / static_cast(width * height)); + + std::vector> sizes = { + {(static_cast(std::llround(width * scale)) / PATCH_SIZE) * PATCH_SIZE, (static_cast(std::llround(height * scale)) / PATCH_SIZE) * PATCH_SIZE}, + {(static_cast(std::llround(width * scale)) / PATCH_SIZE) * PATCH_SIZE, (static_cast(std::floor(height * scale)) / PATCH_SIZE) * PATCH_SIZE}, + {(static_cast(std::floor(width * scale)) / PATCH_SIZE) * PATCH_SIZE, (static_cast(std::llround(height * scale)) / PATCH_SIZE) * PATCH_SIZE}, + {(static_cast(std::floor(width * scale)) / PATCH_SIZE) * PATCH_SIZE, (static_cast(std::floor(height * scale)) / PATCH_SIZE) * PATCH_SIZE}, + }; + std::sort(sizes.begin(), sizes.end(), [](const auto& a, const auto& b) { + return a.first * a.second > b.first * b.second; + }); + + std::pair new_size = sizes.back(); + for (const auto& size : 
sizes) { + if (size.first > 0 && size.second > 0 && size.first * size.second <= s_max) { + new_size = size; + break; + } + } + + double s1 = static_cast(width) / static_cast(new_size.first); + double s2 = static_cast(height) / static_cast(new_size.second); + sd::Tensor resized; + if (s1 < s2) { + int64_t resized_h = static_cast(std::llround(height / s1)); + resized = sd::ops::interpolate(image, {new_size.first, resized_h, image.shape()[2], image.shape()[3]}); + int64_t top = (resized_h - new_size.second) / 2; + resized = sd::ops::slice(resized, 1, top, top + new_size.second); + } else { + int64_t resized_w = static_cast(std::llround(width / s2)); + resized = sd::ops::interpolate(image, {resized_w, new_size.second, image.shape()[2], image.shape()[3]}); + int64_t left = (resized_w - new_size.first) / 2; + resized = sd::ops::slice(resized, 0, left, left + new_size.first); + } + return resized; + } + + static inline std::vector build_position_ids(const std::vector& input_ids, + const std::vector>& image_grids, + const std::vector& skip_vision_start_token) { + std::vector position_ids(4 * input_ids.size(), 0); + int image_index = 0; + int st = 0; + int fix_point = 4096; + std::vector out_t; + std::vector out_h; + std::vector out_w; + + while (st < static_cast(input_ids.size())) { + int ed = st; + while (ed < static_cast(input_ids.size()) && input_ids[ed] != IMAGE_TOKEN_ID) { + ed++; + } + + if (ed >= static_cast(input_ids.size())) { + int st_idx = out_t.empty() ? 0 : (*std::max_element(out_t.begin(), out_t.end()) + 1); + for (int i = 0; i < static_cast(input_ids.size()) - st; ++i) { + out_t.push_back(st_idx + i); + out_h.push_back(st_idx + i); + out_w.push_back(st_idx + i); + } + break; + } + + int text_len = std::max(0, ed - st - skip_vision_start_token[image_index]); + int st_idx = out_t.empty() ? 
0 : (*std::max_element(out_t.begin(), out_t.end()) + 1); + for (int i = 0; i < text_len; ++i) { + out_t.push_back(st_idx + i); + out_h.push_back(st_idx + i); + out_w.push_back(st_idx + i); + } + + auto grid = image_grids[image_index]; + int base; + if (skip_vision_start_token[image_index]) { + if (fix_point > 0) { + base = fix_point; + fix_point = 0; + } else { + base = st_idx; + } + } else { + base = text_len + st_idx; + } + for (int32_t ti = 0; ti < grid[0]; ++ti) { + for (int32_t hi = 0; hi < grid[1]; ++hi) { + for (int32_t wi = 0; wi < grid[2]; ++wi) { + out_t.push_back(base + ti); + out_h.push_back(base + hi); + out_w.push_back(base + wi); + } + } + } + + st = ed + grid[0] * grid[1] * grid[2]; + image_index++; + } + + GGML_ASSERT(out_t.size() == input_ids.size()); + for (size_t i = 0; i < input_ids.size(); ++i) { + // ggml IMROPE consumes 4 flattened position streams: + // [t, h, w, e] + // llama.cpp's generic Qwen-VL fallback expands text positions as + // [pos, pos, pos, 0]. Keep the extra stream zeroed here too. 
+ position_ids[i] = out_t[i]; + position_ids[input_ids.size() + i] = out_h[i]; + position_ids[input_ids.size() * 2 + i] = out_w[i]; + position_ids[input_ids.size() * 3 + i] = 0; + } + return position_ids; + } + + struct TimestepEmbedder : public GGMLBlock { + int frequency_embedding_size = 256; + + TimestepEmbedder(int64_t hidden_size) { + blocks["mlp.0"] = std::make_shared(frequency_embedding_size, hidden_size, true); + blocks["mlp.2"] = std::make_shared(hidden_size, hidden_size, true); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* t) { + auto mlp_0 = std::dynamic_pointer_cast(blocks["mlp.0"]); + auto mlp_2 = std::dynamic_pointer_cast(blocks["mlp.2"]); + auto emb = ggml_ext_timestep_embedding(ctx->ggml_ctx, t, frequency_embedding_size, 10000, 1000.0f); + emb = mlp_0->forward(ctx, emb); + emb = ggml_silu_inplace(ctx->ggml_ctx, emb); + emb = mlp_2->forward(ctx, emb); + return emb; + } + }; + + struct BottleneckPatchEmbed : public GGMLBlock { + BottleneckPatchEmbed(int64_t in_dim, int64_t pca_dim, int64_t embed_dim) { + blocks["proj1"] = std::make_shared(in_dim, pca_dim, false); + blocks["proj2"] = std::make_shared(pca_dim, embed_dim, true); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) { + auto proj1 = std::dynamic_pointer_cast(blocks["proj1"]); + auto proj2 = std::dynamic_pointer_cast(blocks["proj2"]); + return proj2->forward(ctx, proj1->forward(ctx, x)); + } + }; + + struct FinalLayer : public GGMLBlock { + FinalLayer(int64_t hidden_size, int64_t out_dim) { + blocks["linear"] = std::make_shared(hidden_size, out_dim, true); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) { + auto linear = std::dynamic_pointer_cast(blocks["linear"]); + return linear->forward(ctx, x); + } + }; + + struct HiDreamO1Params { + LLM::LLMParams llm; + int patch_size = PATCH_SIZE; + int num_position_embeddings = 2304; + std::vector deepstack_visual_indexes; + }; + + struct VisionMLP : public GGMLBlock { + VisionMLP(int64_t 
hidden_size, int64_t intermediate_size) { + blocks["linear_fc1"] = std::make_shared(hidden_size, intermediate_size, true); + blocks["linear_fc2"] = std::make_shared(intermediate_size, hidden_size, true); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) { + auto linear_fc1 = std::dynamic_pointer_cast(blocks["linear_fc1"]); + auto linear_fc2 = std::dynamic_pointer_cast(blocks["linear_fc2"]); + + x = linear_fc1->forward(ctx, x); + x = ggml_ext_gelu(ctx->ggml_ctx, x); + x = linear_fc2->forward(ctx, x); + return x; + } + }; + + struct VisionPatchEmbed : public GGMLBlock { + int patch_size; + int temporal_patch_size; + int64_t in_channels; + int64_t embed_dim; + + VisionPatchEmbed(int patch_size, + int temporal_patch_size, + int64_t in_channels, + int64_t embed_dim) + : patch_size(patch_size), + temporal_patch_size(temporal_patch_size), + in_channels(in_channels), + embed_dim(embed_dim) { + blocks["proj"] = std::make_shared(in_channels, + embed_dim, + std::tuple{temporal_patch_size, patch_size, patch_size}, + std::tuple{temporal_patch_size, patch_size, patch_size}, + std::tuple{0, 0, 0}, + std::tuple{1, 1, 1}, + true); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) { + auto proj = std::dynamic_pointer_cast(blocks["proj"]); + x = ggml_reshape_4d(ctx->ggml_ctx, + x, + patch_size, + patch_size, + temporal_patch_size, + ggml_nelements(x) / (temporal_patch_size * patch_size * patch_size)); + x = proj->forward(ctx, x); + x = ggml_reshape_2d(ctx->ggml_ctx, x, embed_dim, ggml_nelements(x) / embed_dim); + return x; + } + }; + + struct VisionPatchMerger : public GGMLBlock { + int64_t hidden_size; + bool use_postshuffle_norm; + + VisionPatchMerger(int64_t dim, + int64_t context_dim, + int spatial_merge_size, + bool use_postshuffle_norm) + : hidden_size(context_dim * spatial_merge_size * spatial_merge_size), + use_postshuffle_norm(use_postshuffle_norm) { + blocks["norm"] = std::make_shared(use_postshuffle_norm ? 
hidden_size : context_dim, 1e-6f); + blocks["linear_fc1"] = std::make_shared(hidden_size, hidden_size, true); + blocks["linear_fc2"] = std::make_shared(hidden_size, dim, true); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) { + auto norm = std::dynamic_pointer_cast(blocks["norm"]); + auto linear_fc1 = std::dynamic_pointer_cast(blocks["linear_fc1"]); + auto linear_fc2 = std::dynamic_pointer_cast(blocks["linear_fc2"]); + + x = norm->forward(ctx, x); + x = ggml_reshape_2d(ctx->ggml_ctx, x, hidden_size, ggml_nelements(x) / hidden_size); + x = linear_fc1->forward(ctx, x); + x = ggml_ext_gelu(ctx->ggml_ctx, x); + x = linear_fc2->forward(ctx, x); + return x; + } + }; + + struct VisionAttention : public GGMLBlock { + int head_dim; + int num_heads; + + VisionAttention(int64_t hidden_size, int num_heads) + : num_heads(num_heads) { + head_dim = static_cast(hidden_size / num_heads); + GGML_ASSERT(num_heads * head_dim == hidden_size); + blocks["qkv"] = std::make_shared(hidden_size, hidden_size * 3, true); + blocks["proj"] = std::make_shared(hidden_size, hidden_size, true); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* pe) { + auto qkv_proj = std::dynamic_pointer_cast(blocks["qkv"]); + auto proj = std::dynamic_pointer_cast(blocks["proj"]); + + auto qkv = qkv_proj->forward(ctx, x); + auto qkv_vec = split_qkv(ctx->ggml_ctx, qkv); + + auto q = ggml_reshape_4d(ctx->ggml_ctx, qkv_vec[0], head_dim, num_heads, qkv_vec[0]->ne[1], qkv_vec[0]->ne[2]); + auto k = ggml_reshape_4d(ctx->ggml_ctx, qkv_vec[1], head_dim, num_heads, qkv_vec[1]->ne[1], qkv_vec[1]->ne[2]); + auto v = ggml_reshape_4d(ctx->ggml_ctx, qkv_vec[2], head_dim, num_heads, qkv_vec[2]->ne[1], qkv_vec[2]->ne[2]); + + x = Rope::attention(ctx, q, k, v, pe, nullptr, 1.f, false); + x = proj->forward(ctx, x); + return x; + } + }; + + struct VisionBlock : public GGMLBlock { + VisionBlock(int64_t hidden_size, + int64_t intermediate_size, + int num_heads) { + 
blocks["norm1"] = std::make_shared(hidden_size, 1e-6f); + blocks["norm2"] = std::make_shared(hidden_size, 1e-6f); + blocks["attn"] = std::make_shared(hidden_size, num_heads); + blocks["mlp"] = std::make_shared(hidden_size, intermediate_size); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* pe) { + auto norm1 = std::dynamic_pointer_cast(blocks["norm1"]); + auto norm2 = std::dynamic_pointer_cast(blocks["norm2"]); + auto attn = std::dynamic_pointer_cast(blocks["attn"]); + auto mlp = std::dynamic_pointer_cast(blocks["mlp"]); + + auto residual = x; + x = norm1->forward(ctx, x); + x = attn->forward(ctx, x, pe); + x = ggml_add_inplace(ctx->ggml_ctx, x, residual); + + residual = x; + x = norm2->forward(ctx, x); + x = mlp->forward(ctx, x); + x = ggml_add_inplace(ctx->ggml_ctx, x, residual); + return x; + } + }; + + struct VisionOutput { + ggml_tensor* hidden_states = nullptr; + std::vector deepstack_hidden_states; + }; + + struct VisionModel : public GGMLBlock { + int num_layers; + int spatial_merge_size; + int num_grid_per_side; + std::vector deepstack_visual_indexes; + + VisionModel(int num_layers, + int64_t in_channels, + int64_t hidden_size, + int64_t out_hidden_size, + int64_t intermediate_size, + int num_heads, + int spatial_merge_size, + int patch_size, + int temporal_patch_size, + int num_position_embeddings, + std::vector deepstack_visual_indexes) + : num_layers(num_layers), + spatial_merge_size(spatial_merge_size), + num_grid_per_side(static_cast(std::sqrt(num_position_embeddings))), + deepstack_visual_indexes(std::move(deepstack_visual_indexes)) { + blocks["patch_embed"] = std::make_shared(patch_size, + temporal_patch_size, + in_channels, + hidden_size); + blocks["pos_embed"] = std::make_shared(num_position_embeddings, hidden_size); + for (int i = 0; i < num_layers; ++i) { + blocks["blocks." 
+ std::to_string(i)] = std::make_shared(hidden_size, + intermediate_size, + num_heads); + } + blocks["merger"] = std::make_shared(out_hidden_size, + hidden_size, + spatial_merge_size, + false); + for (int i = 0; i < static_cast(this->deepstack_visual_indexes.size()); ++i) { + blocks["deepstack_merger_list." + std::to_string(i)] = std::make_shared(out_hidden_size, + hidden_size, + spatial_merge_size, + true); + } + } + + ggml_tensor* fast_pos_embed_interpolate(GGMLRunnerContext* ctx, + int grid_h, + int grid_w) { + auto pos_embed = std::dynamic_pointer_cast(blocks["pos_embed"]); + std::vector idx_list[4]; + std::vector weight_list[4]; + idx_list[0].reserve(static_cast(grid_h * grid_w)); + idx_list[1].reserve(static_cast(grid_h * grid_w)); + idx_list[2].reserve(static_cast(grid_h * grid_w)); + idx_list[3].reserve(static_cast(grid_h * grid_w)); + weight_list[0].reserve(static_cast(grid_h * grid_w)); + weight_list[1].reserve(static_cast(grid_h * grid_w)); + weight_list[2].reserve(static_cast(grid_h * grid_w)); + weight_list[3].reserve(static_cast(grid_h * grid_w)); + + double max_index = static_cast(num_grid_per_side - 1); + for (int h = 0; h < grid_h; ++h) { + double h_pos = grid_h == 1 ? 0.0 : max_index * h / static_cast(grid_h - 1); + int h_floor = static_cast(std::floor(h_pos)); + int h_ceil = std::min(h_floor + 1, num_grid_per_side - 1); + double dh = h_pos - h_floor; + for (int w = 0; w < grid_w; ++w) { + double w_pos = grid_w == 1 ? 
0.0 : max_index * w / static_cast(grid_w - 1); + int w_floor = static_cast(std::floor(w_pos)); + int w_ceil = std::min(w_floor + 1, num_grid_per_side - 1); + double dw = w_pos - w_floor; + + idx_list[0].push_back(h_floor * num_grid_per_side + w_floor); + idx_list[1].push_back(h_floor * num_grid_per_side + w_ceil); + idx_list[2].push_back(h_ceil * num_grid_per_side + w_floor); + idx_list[3].push_back(h_ceil * num_grid_per_side + w_ceil); + + weight_list[0].push_back(static_cast((1.0 - dh) * (1.0 - dw))); + weight_list[1].push_back(static_cast((1.0 - dh) * dw)); + weight_list[2].push_back(static_cast(dh * (1.0 - dw))); + weight_list[3].push_back(static_cast(dh * dw)); + } + } + + ggml_tensor* patch_pos_embeds = nullptr; + for (int i = 0; i < 4; ++i) { + auto idx_tensor = ggml_new_tensor_1d(ctx->ggml_ctx, GGML_TYPE_I32, static_cast(idx_list[i].size())); + std::memcpy(idx_tensor->data, idx_list[i].data(), idx_list[i].size() * sizeof(int32_t)); + auto embed = pos_embed->forward(ctx, idx_tensor); + auto weight_tensor = ggml_new_tensor_2d(ctx->ggml_ctx, GGML_TYPE_F32, 1, static_cast(weight_list[i].size())); + std::memcpy(weight_tensor->data, weight_list[i].data(), weight_list[i].size() * sizeof(float)); + embed = ggml_mul(ctx->ggml_ctx, embed, weight_tensor); + patch_pos_embeds = patch_pos_embeds == nullptr ? 
embed : ggml_add(ctx->ggml_ctx, patch_pos_embeds, embed); + } + + patch_pos_embeds = ggml_reshape_4d(ctx->ggml_ctx, + patch_pos_embeds, + patch_pos_embeds->ne[0], + spatial_merge_size, + grid_w / spatial_merge_size, + grid_h * spatial_merge_size); + patch_pos_embeds = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, patch_pos_embeds, 0, 1, 3, 2)); + patch_pos_embeds = ggml_reshape_2d(ctx->ggml_ctx, + patch_pos_embeds, + patch_pos_embeds->ne[0], + ggml_nelements(patch_pos_embeds) / patch_pos_embeds->ne[0]); + return patch_pos_embeds; + } + + VisionOutput forward(GGMLRunnerContext* ctx, + ggml_tensor* pixel_values, + ggml_tensor* pe, + int grid_h, + int grid_w) { + auto patch_embed = std::dynamic_pointer_cast(blocks["patch_embed"]); + auto merger = std::dynamic_pointer_cast(blocks["merger"]); + + auto x = patch_embed->forward(ctx, pixel_values); + auto pos_embeds = fast_pos_embed_interpolate(ctx, grid_h, grid_w); + x = ggml_add(ctx->ggml_ctx, x, pos_embeds); + x = ggml_reshape_3d(ctx->ggml_ctx, x, x->ne[0], x->ne[1], 1); + + VisionOutput out; + for (int i = 0; i < num_layers; ++i) { + auto block = std::dynamic_pointer_cast(blocks["blocks." + std::to_string(i)]); + x = block->forward(ctx, x, pe); + for (int j = 0; j < static_cast(deepstack_visual_indexes.size()); ++j) { + if (deepstack_visual_indexes[j] == i) { + auto deepstack_merger = std::dynamic_pointer_cast(blocks["deepstack_merger_list." 
+ std::to_string(j)]); + out.deepstack_hidden_states.push_back(deepstack_merger->forward(ctx, x)); + break; + } + } + } + + out.hidden_states = merger->forward(ctx, x); + return out; + } + }; + + struct HiDreamO1Model : public GGMLBlock { + HiDreamO1Params params; + + HiDreamO1Model() = default; + explicit HiDreamO1Model(HiDreamO1Params params) + : params(std::move(params)) { + blocks["language_model"] = std::make_shared(this->params.llm); + blocks["visual"] = std::make_shared(this->params.llm.vision.num_layers, + this->params.llm.vision.in_channels, + this->params.llm.vision.hidden_size, + this->params.llm.vision.out_hidden_size, + this->params.llm.vision.intermediate_size, + this->params.llm.vision.num_heads, + this->params.llm.vision.spatial_merge_size, + this->params.llm.vision.patch_size, + this->params.llm.vision.temporal_patch_size, + this->params.num_position_embeddings, + this->params.deepstack_visual_indexes); + blocks["t_embedder1"] = std::make_shared(this->params.llm.hidden_size); + blocks["x_embedder"] = std::make_shared(this->params.patch_size * this->params.patch_size * 3, + this->params.llm.hidden_size / 4, + this->params.llm.hidden_size); + blocks["final_layer2"] = std::make_shared(this->params.llm.hidden_size, + this->params.patch_size * this->params.patch_size * 3); + } + + std::shared_ptr text_model() { + return std::dynamic_pointer_cast(blocks["language_model"]); + } + + std::shared_ptr vision_model() { + return std::dynamic_pointer_cast(blocks["visual"]); + } + + std::shared_ptr timestep_embedder() { + return std::dynamic_pointer_cast(blocks["t_embedder1"]); + } + + std::shared_ptr patch_embedder() { + return std::dynamic_pointer_cast(blocks["x_embedder"]); + } + + std::shared_ptr final_layer() { + return std::dynamic_pointer_cast(blocks["final_layer2"]); + } + }; + + struct HiDreamO1Runner : public GGMLRunner { + HiDreamO1Params params; + HiDreamO1Model model; + + std::vector window_index_vec; + std::vector window_inverse_index_vec; + 
std::vector window_mask_vec; + std::vector pe_vec; + std::vector attention_mask_vec; + + HiDreamO1Runner(ggml_backend_t backend, + bool offload_params_to_cpu, + const String2TensorStorage& tensor_storage_map = {}, + const std::string& prefix = "model") + : GGMLRunner(backend, offload_params_to_cpu) { + params.llm.arch = LLM::LLMArch::QWEN3_VL; + params.llm.hidden_size = 4096; + params.llm.intermediate_size = 12288; + params.llm.num_layers = 36; + params.llm.num_heads = 32; + params.llm.num_kv_heads = 8; + params.llm.head_dim = 128; + params.llm.qkv_bias = false; + params.llm.qk_norm = true; + params.llm.vocab_size = 151936; + params.llm.rms_norm_eps = 1e-6f; + params.llm.vision.num_layers = 27; + params.llm.vision.hidden_size = 1152; + params.llm.vision.intermediate_size = 4304; + params.llm.vision.num_heads = 16; + params.llm.vision.out_hidden_size = 4096; + params.llm.vision.patch_size = 16; + params.llm.vision.spatial_merge_size = 2; + params.llm.vision.temporal_patch_size = 2; + params.num_position_embeddings = 2304; + params.deepstack_visual_indexes = {8, 16, 24}; + + model = HiDreamO1Model(params); + model.init(params_ctx, tensor_storage_map, prefix); + } + + std::string get_desc() override { + return "hidream_o1"; + } + + void get_param_tensors(std::map& tensors, const std::string& prefix) { + model.get_param_tensors(tensors, prefix); + } + + ggml_tensor* process_image(ggml_context* ctx, ggml_tensor* image) { + int64_t C = image->ne[2]; + int64_t H = image->ne[1]; + int64_t W = image->ne[0]; + int64_t mh = params.llm.vision.spatial_merge_size; + int64_t mw = params.llm.vision.spatial_merge_size; + int64_t pt = params.llm.vision.temporal_patch_size; + int64_t ph = params.llm.vision.patch_size; + int64_t pw = params.llm.vision.patch_size; + + image = ggml_reshape_4d(ctx, image, pw, mw, (W / mw / pw), H * C); + image = ggml_cont(ctx, ggml_ext_torch_permute(ctx, image, 0, 2, 3, 1)); + image = ggml_reshape_4d(ctx, image, pw * (W / mw / pw), H, C, mw); + image = 
ggml_cont(ctx, ggml_ext_torch_permute(ctx, image, 0, 2, 3, 1)); + image = ggml_reshape_4d(ctx, image, pw, (W / mw / pw) * C * mw, ph, mh * (H / mh / ph)); + image = ggml_cont(ctx, ggml_ext_torch_permute(ctx, image, 0, 2, 1, 3)); + image = ggml_reshape_4d(ctx, image, pw * ph, (W / mw / pw), C, mw * mh * (H / mh / ph)); + image = ggml_concat(ctx, image, image, 0); + image = ggml_cont(ctx, ggml_ext_torch_permute(ctx, image, 0, 2, 1, 3)); + image = ggml_reshape_4d(ctx, image, pw * ph * pt * C, (W / mw / pw), mw * mh, (H / mh / ph)); + image = ggml_cont(ctx, ggml_ext_torch_permute(ctx, image, 0, 2, 1, 3)); + image = ggml_reshape_2d(ctx, image, pw * ph * pt * C, mw * mh * (W / mw / pw) * (H / mh / ph)); + return image; + } + + ggml_tensor* concat_seq(GGMLRunnerContext* ctx, ggml_tensor* a, ggml_tensor* b) { + if (a == nullptr) { + return b; + } + if (b == nullptr) { + return a; + } + return ggml_concat(ctx->ggml_ctx, a, b, 1); + } + + ggml_tensor* scatter_visual_embeds(GGMLRunnerContext* ctx, + ggml_tensor* inputs_embeds, + const sd::Tensor& image_embed_ranges_tensor, + ggml_tensor* visual_embeds) { + if (visual_embeds == nullptr || image_embed_ranges_tensor.empty()) { + return inputs_embeds; + } + + ggml_tensor* output = nullptr; + int prev_end = 0; + int n_ranges = static_cast(image_embed_ranges_tensor.shape()[1]); + int visual_offset = 0; + for (int i = 0; i < n_ranges; ++i) { + int start = image_embed_ranges_tensor.values()[i * 2]; + int len = image_embed_ranges_tensor.values()[i * 2 + 1]; + + if (start > prev_end) { + output = concat_seq(ctx, output, ggml_ext_slice(ctx->ggml_ctx, inputs_embeds, 1, prev_end, start)); + } + + output = concat_seq(ctx, + output, + ggml_ext_slice(ctx->ggml_ctx, visual_embeds, 1, visual_offset, visual_offset + len)); + prev_end = start + len; + visual_offset += len; + } + + if (prev_end < inputs_embeds->ne[1]) { + output = concat_seq(ctx, output, ggml_ext_slice(ctx->ggml_ctx, inputs_embeds, 1, prev_end, inputs_embeds->ne[1])); + } + 
return output == nullptr ? inputs_embeds : output;
+        }
+        // Encode one image with the vision tower: check its extents, derive the patch grid, build the rotary table via Rope::gen_qwen2vl_pe, then return vision->forward(...).
+        VisionOutput encode_image(GGMLRunnerContext* runner_ctx, ggml_tensor* image) {
+            auto vision = model.vision_model();
+            GGML_ASSERT(image->ne[1] % (params.llm.vision.patch_size * params.llm.vision.spatial_merge_size) == 0);  // ne[1] must be a multiple of patch_size * spatial_merge_size
+            GGML_ASSERT(image->ne[0] % (params.llm.vision.patch_size * params.llm.vision.spatial_merge_size) == 0);  // same requirement for ne[0]
+            // Patch grid: grid_h = ne[1] / patch_size, grid_w = ne[0] / patch_size.
+            int grid_h = static_cast(image->ne[1]) / params.llm.vision.patch_size;
+            int grid_w = static_cast(image->ne[0]) / params.llm.vision.patch_size;
+
+            auto pixel_values = process_image(compute_ctx, image);
+            // window_index_vec holds 0..n-1 in order, with n = (grid_h / spatial_merge_size) * (grid_w / spatial_merge_size).
+            int head_dim = static_cast(params.llm.vision.hidden_size / params.llm.vision.num_heads);
+            std::vector window_index_vec(static_cast((grid_h / params.llm.vision.spatial_merge_size) * (grid_w / params.llm.vision.spatial_merge_size)));
+            for (int i = 0; i < static_cast(window_index_vec.size()); ++i) {
+                window_index_vec[static_cast(i)] = i;
+            }
+            pe_vec = Rope::gen_qwen2vl_pe(grid_h, grid_w, params.llm.vision.spatial_merge_size, window_index_vec, 10000, {head_dim / 2, head_dim / 2});  // pe_vec is assigned, not declared, here; its declaration is outside this chunk
+            int pos_len = static_cast(pe_vec.size() / head_dim / 2);
+            auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, head_dim / 2, pos_len);  // 2 * 2 * (head_dim / 2) * pos_len == pe_vec.size() when the division above is exact
+            set_backend_tensor_data(pe, pe_vec.data());
+
+            return vision->forward(runner_ctx, pixel_values, pe, grid_h, grid_w);
+        }
+        // Build the graph for one call. Sequence layout: text embeddings (last text slot replaced by the timestep embedding) followed by the patch embeddings of x and of every ref image; the graph output is (x - x_pred) / sigma.
+        ggml_cgraph* build_graph(const sd::Tensor& x_tensor,
+                                 const sd::Tensor& timestep_tensor,
+                                 const sd::Tensor& input_ids_tensor,
+                                 const sd::Tensor& input_pos_tensor,
+                                 const sd::Tensor& token_types_tensor,
+                                 const sd::Tensor& image_embed_ranges_tensor,
+                                 const sd::Tensor& vinput_mask_tensor,
+                                 const std::vector>& vlm_images,
+                                 const std::vector>& ref_images) {
+            ggml_cgraph* gf = new_graph_custom(HIDREAM_O1_GRAPH_SIZE);
+            ggml_tensor* x = make_input(x_tensor);
+            ggml_tensor* timestep = make_input(timestep_tensor);
+            ggml_tensor* input_ids = make_input(input_ids_tensor);
+            ggml_tensor* input_pos = make_input(input_pos_tensor);
+
+            auto text_model = model.text_model();
+            auto t_embedder1 = model.timestep_embedder();
+            auto x_embedder = model.patch_embedder();
+            auto final_layer2 = model.final_layer();
+            // Register each VLM image and each reference image as a graph input.
+            std::vector vlm_image_tensors;
+            for (const auto& image : vlm_images) {
+                vlm_image_tensors.push_back(make_input(image));
+            }
+
+            std::vector ref_image_tensors;
+            for (const auto& image : ref_images) {
+                ref_image_tensors.push_back(make_input(image));
+            }
+            // attention_mask_vec (declared outside this function) is total_seq_len x total_seq_len and zero-filled; entry [query][key] becomes -INFINITY only when the query token has token_type 0 and key > query.
+            attention_mask_vec = std::vector(static_cast(token_types_tensor.shape()[0] * token_types_tensor.shape()[0]), 0.0f);
+            int64_t total_seq_len = token_types_tensor.shape()[0];
+            for (int64_t query = 0; query < total_seq_len; ++query) {
+                bool is_gen = token_types_tensor.values()[static_cast(query)] > 0;  // rows with token_type > 0 keep an all-zero mask row
+                for (int64_t key = 0; key < total_seq_len; ++key) {
+                    if (!is_gen && key > query) {
+                        attention_mask_vec[static_cast(query * total_seq_len + key)] = -INFINITY;
+                    }
+                }
+            }
+            auto attention_mask = ggml_new_tensor_2d(compute_ctx, GGML_TYPE_F32, total_seq_len, total_seq_len);
+            set_backend_tensor_data(attention_mask, attention_mask_vec.data());
+            // Encode the VLM images one by one and concatenate their hidden_states along dim 1; visual_embeds stays nullptr when there are no images.
+            auto runner_ctx = get_context();
+            ggml_tensor* visual_embeds = nullptr;
+            for (size_t i = 0; i < vlm_image_tensors.size(); ++i) {
+                auto image_output = encode_image(&runner_ctx, vlm_image_tensors[i]);
+                visual_embeds = visual_embeds == nullptr ? image_output.hidden_states : ggml_concat(compute_ctx, visual_embeds, image_output.hidden_states, 1);
+            }
+            // Token embeddings for input_ids, then scatter_visual_embeds with the image ranges (presumably writes the image features into their placeholder slots; confirm in scatter_visual_embeds).
+            auto txt = text_model->embed(&runner_ctx, input_ids);
+            txt = scatter_visual_embeds(&runner_ctx, txt, image_embed_ranges_tensor, visual_embeds);
+            // Replace the last text position with the timestep embedding: slice dim 1 of txt with bounds (0, txt_seq_len - 1) and append t_emb reshaped to (ne[0], 1, 1); a single-token txt becomes t_emb alone.
+            auto t_emb = t_embedder1->forward(&runner_ctx, timestep);
+            int64_t txt_seq_len = input_ids->ne[0];
+            if (txt_seq_len > 1) {
+                auto prefix = ggml_ext_slice(compute_ctx, txt, 1, 0, txt_seq_len - 1);
+                txt = ggml_concat(compute_ctx, prefix, ggml_reshape_3d(compute_ctx, t_emb, t_emb->ne[0], 1, 1), 1);
+            } else {
+                txt = ggml_reshape_3d(compute_ctx, t_emb, t_emb->ne[0], 1, 1);
+            }
+            // Patchify x (its token count is target_tokens), append every patchified reference image along dim 1, then embed all of them together.
+            auto vinputs = DiT::pad_and_patchify(&runner_ctx, x, PATCH_SIZE, PATCH_SIZE);
+            int64_t target_tokens = vinputs->ne[1];
+            for (ggml_tensor* ref_image : ref_image_tensors) {
+                auto ref = DiT::pad_and_patchify(&runner_ctx, ref_image, PATCH_SIZE, PATCH_SIZE);
+                vinputs = ggml_concat(compute_ctx, vinputs, ref, 1);
+            }
+            auto vis = x_embedder->forward(&runner_ctx, vinputs);
+            // Run the language model on the concatenation of txt and vis, passing an empty out_layers set ({}), then apply the final layer.
+            auto inputs_embeds = ggml_concat(compute_ctx, txt, vis, 1);
+            auto hidden_states = text_model->forward_embeds(&runner_ctx, inputs_embeds, input_pos, attention_mask, {});
+            auto x_pred_all = final_layer2->forward(&runner_ctx, hidden_states);
+            // First row of the prediction window: txt_seq_len, or the index of the first non-zero vinput_mask entry when a mask is supplied.
+            int64_t x_pred_start = txt_seq_len;
+            if (!vinput_mask_tensor.empty()) {
+                int64_t seq_len = static_cast(vinput_mask_tensor.shape()[0]);
+                int64_t first_vinput = 0;
+                while (first_vinput < seq_len && vinput_mask_tensor.values()[static_cast(first_vinput)] == 0) {
+                    first_vinput++;
+                }
+                x_pred_start = first_vinput;  // equals seq_len when the mask is all zeros
+            }
+            auto x_pred = ggml_view_3d(compute_ctx,
+                                       x_pred_all,
+                                       x_pred_all->ne[0],
+                                       target_tokens,
+                                       x_pred_all->ne[2],
+                                       x_pred_all->nb[1],
+                                       x_pred_all->nb[2],
+                                       x_pred_start * x_pred_all->nb[1]);  // byte offset of row x_pred_start; the view spans target_tokens rows
+            x_pred = ggml_cont(compute_ctx, x_pred);
+            x_pred = DiT::unpatchify_and_crop(compute_ctx, x_pred, x->ne[1], x->ne[0], PATCH_SIZE, PATCH_SIZE);
+            // sigma = 1 - timestep[0], clamped to at least 1e-6 because it is used as a divisor below.
+            float sigma = 1.0f - timestep_tensor.values()[0];
+            sigma =
std::max(1e-6f, sigma);
+            auto out = ggml_scale(compute_ctx, ggml_sub(compute_ctx, x, x_pred), 1.0f / sigma);  // (x - x_pred) / sigma
+
+            ggml_build_forward_expand(gf, out);
+            return gf;
+        }
+        // Build the graph through build_graph and run it with GGMLRunner::compute; the result goes through restore_trailing_singleton_dims with x.dim().
+        sd::Tensor compute(int n_threads,
+                           const sd::Tensor& x,
+                           const sd::Tensor& timestep,
+                           const sd::Tensor& input_ids,
+                           const sd::Tensor& input_pos,
+                           const sd::Tensor& token_types,
+                           const sd::Tensor& image_embed_ranges,
+                           const sd::Tensor& vinput_mask,
+                           const std::vector>& vlm_images,
+                           const std::vector>& ref_images) {
+            auto get_graph = [&]() {  // captures the arguments by reference; the lambda is only handed to GGMLRunner::compute below
+                return build_graph(x, timestep, input_ids, input_pos, token_types, image_embed_ranges, vinput_mask, vlm_images, ref_images);
+            };
+            return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false), x.dim());
+        }
+    };
+    // Conditioner without weights of its own: the parameter/buffer overrides are no-ops and get_learned_condition only fills token-level tensors and preprocessed images.
+    struct HiDreamO1Conditioner : public Conditioner {
+        Qwen2Tokenizer tokenizer;
+
+        void get_param_tensors(std::map& tensors) override {
+            SD_UNUSED(tensors);
+        }
+
+        void alloc_params_buffer() override {}
+        void free_params_buffer() override {}
+        size_t get_params_buffer_size() override { return 0; }
+        void set_flash_attention_enabled(bool enabled) override { SD_UNUSED(enabled); }
+        // Builds the chat prompt, tokenizes it and fills c_input_ids, c_position_ids, c_token_types, c_vinput_mask and (with reference images) c_image_embed_ranges, c_ref_images, c_vlm_images. n_threads is unused.
+        SDCondition get_learned_condition(int n_threads,
+                                          const ConditionerParams& conditioner_params) override {
+            SD_UNUSED(n_threads);
+            SDCondition result;
+            // Number of PATCH_SIZE x PATCH_SIZE patches in the target image (integer division on both axes).
+            int width = conditioner_params.width;
+            int height = conditioner_params.height;
+            int64_t target_image_len = static_cast(width / PATCH_SIZE) * static_cast(height / PATCH_SIZE);
+
+            std::vector> ref_images;
+            if (conditioner_params.ref_images != nullptr) {
+                ref_images = *conditioner_params.ref_images;  // copies the caller's reference images
+            }
+
+            std::vector> vlm_images;  // NOTE(review): not referenced again in this function; result.c_vlm_images is filled directly
+            std::vector> image_grids;
+            std::vector skip_vision_start;
+
+            std::string prompt = "<|im_start|>user\n";
+            std::vector image_ranges;
+            // Text-only path: no reference images, so no image ranges and no VLM images are produced.
+            if (ref_images.empty()) {
+                prompt += conditioner_params.text;
+                prompt += "<|im_end|>\n<|im_start|>assistant\n<|boi_token|><|tms_token|>";
+                auto input_ids = tokenizer.encode(prompt, nullptr);
+                // Padded id list used only for position ids: the text ids, then one VISION_START_TOKEN_ID and target_image_len - 1 IMAGE_TOKEN_ID.
+                std::vector input_ids_pad = input_ids;
+                input_ids_pad.push_back(VISION_START_TOKEN_ID);
+                input_ids_pad.insert(input_ids_pad.end(), target_image_len - 1, IMAGE_TOKEN_ID);  // NOTE(review): count is -1 if target_image_len is 0 (width or height < PATCH_SIZE); confirm callers rule that out
+
+                image_grids.push_back({1, static_cast(height / PATCH_SIZE), static_cast(width / PATCH_SIZE)});
+                skip_vision_start.push_back(1);
+                // token_types: 0 by default, 1 on [bgn, bgn + target_image_len + TIMESTEP_TOKEN_NUM), then 3 on the last TIMESTEP_TOKEN_NUM text tokens.
+                std::vector token_types(input_ids_pad.size(), 0);
+                int txt_seq_len = static_cast(input_ids.size());
+                int bgn = txt_seq_len - TIMESTEP_TOKEN_NUM;
+                for (int i = bgn; i < bgn + target_image_len + TIMESTEP_TOKEN_NUM; ++i) {
+                    token_types[i] = 1;
+                }
+                for (int i = txt_seq_len - TIMESTEP_TOKEN_NUM; i < txt_seq_len; ++i) {
+                    token_types[i] = 3;  // overwrites the 1 written by the loop above
+                }
+
+                auto position_ids = build_position_ids(input_ids_pad, image_grids, skip_vision_start);
+                // Shapes are 1-D; the position tensor is declared with 4 values per padded token. vinput_mask is 0 for the text ids and 1 from index txt_seq_len on.
+                std::vector input_shape{static_cast(input_ids.size())};
+                std::vector position_shape{static_cast(input_ids_pad.size() * 4)};
+                std::vector token_type_shape{static_cast(token_types.size())};
+                std::vector vinput_mask(token_types.size(), 0);
+                for (int64_t i = txt_seq_len; i < static_cast(vinput_mask.size()); ++i) {
+                    vinput_mask[static_cast(i)] = 1;
+                }
+                std::vector vinput_mask_shape{static_cast(vinput_mask.size())};
+
+                result.c_input_ids = sd::Tensor(input_shape, std::move(input_ids));
+                result.c_position_ids = sd::Tensor(position_shape, position_ids);
+                result.c_token_types = sd::Tensor(token_type_shape, std::move(token_types));
+                result.c_vinput_mask = sd::Tensor(vinput_mask_shape, std::move(vinput_mask));
+                result.c_image_embed_ranges = sd::Tensor();
+                return result;
+            }
+            // Reference-image path. max_size (passed to resize_to_area) shrinks as the image count K grows: x1, x48/64, x1/2, x24/64, x1/4 of max(height, width).
+            int K = static_cast(ref_images.size());
+            int max_size;
+            if (K == 1) {
+                max_size = std::max(height, width);
+            } else if (K == 2) {
+                max_size = std::max(height, width) * 48 / 64;
+            } else if (K <= 4) {
+                max_size = std::max(height, width) / 2;
+            } else if (K <= 8) {
+                max_size = std::max(height, width) * 24 / 64;
+            } else {
+                max_size = std::max(height, width) / 4;
+            }
+            // cond_img_size (passed to calculate_dimensions for the VLM copy) is 384, 384 * 48 / 64 or 384 / 2 depending on K.
+            int cond_img_size;
+            if (K <= 4) {
+                cond_img_size = 384;
+            } else if (K <= 8) {
+                cond_img_size = 384 * 48 / 64;
+            } else {
+                cond_img_size = 384 / 2;
+            }
+            // Each reference image yields two tensors: a resize_to_area copy clamped to [0, 1] and mapped to [-1, 1] (c_ref_images), and a clip_preprocess copy (c_vlm_images).
+            for (const auto& ref_image : ref_images) {
+                auto patch_img = resize_to_area(ref_image, max_size);
+                patch_img = sd::ops::clamp(patch_img, 0.0f, 1.0f);
+                patch_img = patch_img * 2.0f - 1.0f;
+                result.c_ref_images.push_back(std::move(patch_img));
+                // The second argument of calculate_dimensions is shape()[0] / shape()[1] of the reference image.
+                auto dims = calculate_dimensions(cond_img_size, static_cast(ref_image.shape()[0]) / static_cast(ref_image.shape()[1]));
+                auto vlm_image = clip_preprocess(ref_image, dims.first, dims.second);
+                int64_t image_tokens = static_cast(dims.first / PATCH_SIZE) * static_cast(dims.second / PATCH_SIZE);
+                int64_t prompt_start = static_cast(tokenizer.encode(prompt + "<|vision_start|>", nullptr).size());  // token count up to and including <|vision_start|>; NOTE(review): re-encodes the whole prompt for every reference image
+                prompt += "<|vision_start|>";
+                prompt += repeat_special_token("<|image_pad|>", image_tokens);
+                prompt += "<|vision_end|>";
+                image_ranges.push_back(static_cast(prompt_start));  // image_ranges is a flat list of (prompt_start, image_tokens) pairs
+                image_ranges.push_back(static_cast(image_tokens));
+                result.c_vlm_images.push_back(std::move(vlm_image));
+                image_grids.push_back({1, dims.second / PATCH_SIZE, dims.first / PATCH_SIZE});
+                skip_vision_start.push_back(0);
+            }
+
+            prompt += conditioner_params.text;
+            prompt += "<|im_end|>\n<|im_start|>assistant\n<|boi_token|><|tms_token|>";
+            auto input_ids = tokenizer.encode(prompt, nullptr);
+            // Padded id list for position ids: text ids, then the target-image span (one VISION_START_TOKEN_ID + target_image_len - 1 IMAGE_TOKEN_ID), then one such span per reference image.
+            std::vector input_ids_pad = input_ids;
+            input_ids_pad.push_back(VISION_START_TOKEN_ID);
+            input_ids_pad.insert(input_ids_pad.end(), target_image_len - 1, IMAGE_TOKEN_ID);  // NOTE(review): count is -1 if target_image_len is 0; confirm callers rule that out
+            image_grids.push_back({1, static_cast(height / PATCH_SIZE), static_cast(width / PATCH_SIZE)});
+            skip_vision_start.push_back(1);
+
+            int64_t total_ref_len = 0;
+            for (const auto& ref_image : result.c_ref_images) {
+                int64_t ref_len = static_cast(ref_image.shape()[0] / PATCH_SIZE) * static_cast(ref_image.shape()[1] / PATCH_SIZE);
+                total_ref_len += ref_len;
+                input_ids_pad.push_back(VISION_START_TOKEN_ID);
+                input_ids_pad.insert(input_ids_pad.end(), ref_len - 1, IMAGE_TOKEN_ID);  // NOTE(review): same -1 count hazard when a resized reference is smaller than PATCH_SIZE on either axis
+                image_grids.push_back({1, static_cast(ref_image.shape()[1] / PATCH_SIZE), static_cast(ref_image.shape()[0] / PATCH_SIZE)});
+                skip_vision_start.push_back(1);
+            }
+            // token_types: 0 by default, 1 on [bgn, end), 2 on the total_ref_len entries after end, then 3 on the last TIMESTEP_TOKEN_NUM text tokens.
+            std::vector token_types(input_ids_pad.size(), 0);
+            int txt_seq_len = static_cast(input_ids.size());
+            int bgn = txt_seq_len - TIMESTEP_TOKEN_NUM;
+            int end = bgn + static_cast(target_image_len) + TIMESTEP_TOKEN_NUM;
+            for (int i = bgn; i < end; ++i) {
+                token_types[i] = 1;
+            }
+            for (int i = end; i < end + total_ref_len; ++i) {
+                token_types[i] = 2;
+            }
+            for (int i = txt_seq_len - TIMESTEP_TOKEN_NUM; i < txt_seq_len; ++i) {
+                token_types[i] = 3;  // overwrites the 1 written by the first loop
+            }
+            // image_range_shape is {2, number of pairs}; vinput_mask is 0 for the text ids and 1 from index txt_seq_len on.
+            std::vector input_shape{static_cast(input_ids.size())};
+            std::vector position_shape{static_cast(input_ids_pad.size() * 4)};
+            std::vector token_type_shape{static_cast(token_types.size())};
+            std::vector image_range_shape{2, static_cast(image_ranges.size() / 2)};
+            std::vector vinput_mask(token_types.size(), 0);
+            for (int i = txt_seq_len; i < static_cast(vinput_mask.size()); ++i) {
+                vinput_mask[static_cast(i)] = 1;
+            }
+            std::vector vinput_mask_shape{static_cast(vinput_mask.size())};
+
+            result.c_input_ids = sd::Tensor(input_shape, std::move(input_ids));
+            result.c_position_ids = sd::Tensor(position_shape, build_position_ids(input_ids_pad, image_grids, skip_vision_start));
+            result.c_token_types = sd::Tensor(token_type_shape, std::move(token_types));
+            result.c_image_embed_ranges = sd::Tensor(image_range_shape, std::move(image_ranges));
+            result.c_vinput_mask = sd::Tensor(vinput_mask_shape, std::move(vinput_mask));
+            return result;
+        }
+    };
+}  // namespace HiDreamO1
+
+#endif  // __SD_HIDREAM_O1_H__
diff --git a/src/llm.hpp b/src/llm.hpp
index a67b4ebf3..cc6bf4170 100644
--- a/src/llm.hpp
+++ b/src/llm.hpp
@@ -27,6 +27,7 @@ namespace LLM {
     enum class LLMArch {
         QWEN2_5_VL,
         QWEN3,
+        QWEN3_VL,
         MISTRAL_SMALL_3_2,
         MINISTRAL_3_3B,
         ARCH_COUNT,
@@ -35,6 +36,7 @@ namespace LLM {
     static const char* llm_arch_to_str[] = {
         "qwen2.5vl",
         "qwen3",
+        "qwen3vl",
         "mistral_small3.2",
"ministral3.3b", }; @@ -430,6 +432,10 @@ namespace LLM { } else if (arch == LLMArch::QWEN3) { q = ggml_rope_ext(ctx->ggml_ctx, q, input_pos, nullptr, 128, GGML_ROPE_TYPE_NEOX, 40960, 1000000.f, 1.f, 0.f, 1.f, 32.f, 1.f); k = ggml_rope_ext(ctx->ggml_ctx, k, input_pos, nullptr, 128, GGML_ROPE_TYPE_NEOX, 40960, 1000000.f, 1.f, 0.f, 1.f, 32.f, 1.f); + } else if (arch == LLMArch::QWEN3_VL) { + int sections[4] = {24, 20, 20, 0}; + q = ggml_rope_multi(ctx->ggml_ctx, q, input_pos, nullptr, head_dim, sections, GGML_ROPE_TYPE_IMROPE, 262144, 5000000.f, 1.f, 0.f, 1.f, 32.f, 1.f); + k = ggml_rope_multi(ctx->ggml_ctx, k, input_pos, nullptr, head_dim, sections, GGML_ROPE_TYPE_IMROPE, 262144, 5000000.f, 1.f, 0.f, 1.f, 32.f, 1.f); } else { int sections[4] = {16, 24, 24, 0}; q = ggml_rope_multi(ctx->ggml_ctx, q, input_pos, nullptr, head_dim, sections, GGML_ROPE_TYPE_MROPE, 128000, 1000000.f, 1.f, 0.f, 1.f, 32.f, 1.f); @@ -485,10 +491,11 @@ namespace LLM { struct TextModel : public GGMLBlock { protected: int64_t num_layers; + LLMParams params; public: TextModel(const LLMParams& params) - : num_layers(params.num_layers) { + : num_layers(params.num_layers), params(params) { blocks["embed_tokens"] = std::shared_ptr(new Embedding(params.vocab_size, params.hidden_size)); for (int i = 0; i < num_layers; i++) { blocks["layers." 
+ std::to_string(i)] = std::shared_ptr(new TransformerBlock(params)); @@ -496,62 +503,63 @@ namespace LLM { blocks["norm"] = std::shared_ptr(new RMSNorm(params.hidden_size, params.rms_norm_eps)); } - ggml_tensor* forward(GGMLRunnerContext* ctx, - ggml_tensor* input_ids, - ggml_tensor* input_pos, - ggml_tensor* attention_mask, - std::vector> image_embeds, - std::set out_layers) { - // input_ids: [N, n_token] - // return: [N, n_token, hidden_size] - + ggml_tensor* embed(GGMLRunnerContext* ctx, + ggml_tensor* input_ids) { auto embed_tokens = std::dynamic_pointer_cast(blocks["embed_tokens"]); - auto norm = std::dynamic_pointer_cast(blocks["norm"]); - - auto x = embed_tokens->forward(ctx, input_ids); - sd::ggml_graph_cut::mark_graph_cut(x, "llm.text.prelude", "x"); - - std::vector intermediate_outputs; - - if (image_embeds.size() > 0) { - GGML_ASSERT(x->ne[2] == 1); // N == 1 + auto x = embed_tokens->forward(ctx, input_ids); + return x; + } - auto raw_x = ggml_cast(ctx->ggml_ctx, x, image_embeds[0].second->type); - int64_t txt_token_start = 0; - int64_t txt_token_end = 0; + ggml_tensor* splice_image_embeds(GGMLRunnerContext* ctx, + ggml_tensor* x, + std::vector> image_embeds) { + if (image_embeds.empty()) { + return x; + } - ggml_tensor* input_embed = nullptr; + GGML_ASSERT(x->ne[2] == 1); // N == 1 - for (int i = 0; i < image_embeds.size(); i++) { - if (i == 0) { - txt_token_start = 0; - } else { - txt_token_start = image_embeds[i - 1].first + image_embeds[i - 1].second->ne[1]; - } - txt_token_end = image_embeds[i].first; + auto raw_x = ggml_cast(ctx->ggml_ctx, x, image_embeds[0].second->type); + int64_t txt_token_start = 0; + int64_t txt_token_end = 0; + ggml_tensor* input_embed = nullptr; - auto txt_embed = ggml_ext_slice(ctx->ggml_ctx, raw_x, 1, txt_token_start, txt_token_end); - if (input_embed == nullptr) { - input_embed = txt_embed; - } else { - input_embed = ggml_concat(ctx->ggml_ctx, input_embed, txt_embed, 1); - } + for (int i = 0; i < image_embeds.size(); 
i++) { + if (i == 0) { + txt_token_start = 0; + } else { + txt_token_start = image_embeds[i - 1].first + image_embeds[i - 1].second->ne[1]; + } + txt_token_end = image_embeds[i].first; - auto image_embed = image_embeds[i].second; - input_embed = ggml_concat(ctx->ggml_ctx, input_embed, image_embed, 1); + auto txt_embed = ggml_ext_slice(ctx->ggml_ctx, raw_x, 1, txt_token_start, txt_token_end); + if (input_embed == nullptr) { + input_embed = txt_embed; + } else { + input_embed = ggml_concat(ctx->ggml_ctx, input_embed, txt_embed, 1); } - txt_token_start = image_embeds[image_embeds.size() - 1].first + image_embeds[image_embeds.size() - 1].second->ne[1]; - txt_token_end = raw_x->ne[1]; + input_embed = ggml_concat(ctx->ggml_ctx, input_embed, image_embeds[i].second, 1); + } - auto final_txt_embed = ggml_ext_slice(ctx->ggml_ctx, raw_x, 1, txt_token_start, txt_token_end); + txt_token_start = image_embeds[image_embeds.size() - 1].first + image_embeds[image_embeds.size() - 1].second->ne[1]; + txt_token_end = raw_x->ne[1]; - input_embed = ggml_concat(ctx->ggml_ctx, input_embed, final_txt_embed, 1); - GGML_ASSERT(raw_x->ne[1] == input_embed->ne[1]); + auto final_txt_embed = ggml_ext_slice(ctx->ggml_ctx, raw_x, 1, txt_token_start, txt_token_end); + input_embed = ggml_concat(ctx->ggml_ctx, input_embed, final_txt_embed, 1); + GGML_ASSERT(raw_x->ne[1] == input_embed->ne[1]); + return input_embed; + } - x = input_embed; - } + ggml_tensor* forward_embeds(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* input_pos, + ggml_tensor* attention_mask, + std::set out_layers) { + auto norm = std::dynamic_pointer_cast(blocks["norm"]); + std::vector intermediate_outputs; + sd::ggml_graph_cut::mark_graph_cut(x, "llm.text.prelude", "x"); for (int i = 0; i < num_layers; i++) { auto block = std::dynamic_pointer_cast(blocks["layers." 
+ std::to_string(i)]); @@ -570,10 +578,23 @@ namespace LLM { for (int i = 1; i < intermediate_outputs.size(); i++) { x = ggml_concat(ctx->ggml_ctx, x, intermediate_outputs[i], 0); } - } else { - x = norm->forward(ctx, x); + return x; } - return x; + + return norm->forward(ctx, x); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* input_ids, + ggml_tensor* input_pos, + ggml_tensor* attention_mask, + std::vector> image_embeds, + std::set out_layers) { + // input_ids: [N, n_token] + // return: [N, n_token, hidden_size] + auto x = embed(ctx, input_ids); + x = splice_image_embeds(ctx, x, std::move(image_embeds)); + return forward_embeds(ctx, x, input_pos, attention_mask, std::move(out_layers)); } }; diff --git a/src/model.cpp b/src/model.cpp index 8fdde3b76..57b45d3cd 100644 --- a/src/model.cpp +++ b/src/model.cpp @@ -437,6 +437,10 @@ SDVersion ModelLoader::get_sd_version() { if (tensor_storage.name.find("model.diffusion_model.joint_blocks.") != std::string::npos) { return VERSION_SD3; } + if (tensor_storage.name.find("model.x_embedder.proj1.weight") != std::string::npos && + tensor_storage_map.find("model.language_model.layers.0.self_attn.q_proj.weight") != tensor_storage_map.end()) { + return VERSION_HIDREAM_O1; + } if (tensor_storage.name.find("model.diffusion_model.transformer_blocks.0.img_mod.1.weight") != std::string::npos) { return VERSION_QWEN_IMAGE; } diff --git a/src/model.h b/src/model.h index 65bc6c367..212c42227 100644 --- a/src/model.h +++ b/src/model.h @@ -42,6 +42,7 @@ enum SDVersion { VERSION_ANIMA, VERSION_FLUX2, VERSION_FLUX2_KLEIN, + VERSION_HIDREAM_O1, VERSION_Z_IMAGE, VERSION_OVIS_IMAGE, VERSION_ERNIE_IMAGE, @@ -163,6 +164,7 @@ static inline bool sd_version_is_dit(SDVersion version) { sd_version_is_sd3(version) || sd_version_is_wan(version) || sd_version_is_qwen_image(version) || + version == VERSION_HIDREAM_O1 || sd_version_is_anima(version) || sd_version_is_z_image(version) || sd_version_is_ernie_image(version)) { diff --git 
a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index fd439ff1d..e4d5a7b60 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -52,6 +52,7 @@ const char* model_version_to_str[] = { "Anima", "Flux.2", "Flux.2 klein", + "HiDream O1", "Z-Image", "Ovis Image", "Ernie Image", @@ -491,6 +492,12 @@ class StableDiffusionGGML { "model.diffusion_model", version, sd_ctx_params->qwen_image_zero_cond_t); + } else if (version == VERSION_HIDREAM_O1) { + cond_stage_model = std::make_shared(); + diffusion_model = std::make_shared(backend, + offload_params_to_cpu, + tensor_storage_map, + "model"); } else if (sd_version_is_anima(version)) { cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, @@ -625,7 +632,7 @@ class StableDiffusionGGML { } }; - if (version == VERSION_CHROMA_RADIANCE) { + if (version == VERSION_CHROMA_RADIANCE || version == VERSION_HIDREAM_O1) { LOG_INFO("using FakeVAE"); first_stage_model = std::make_shared(version, vae_backend, @@ -796,6 +803,9 @@ class StableDiffusionGGML { ignore_tensors.insert("text_encoders.llm.vision_tower."); ignore_tensors.insert("text_encoders.llm.multi_modal_projector."); } + if (version == VERSION_HIDREAM_O1) { + ignore_tensors.insert("lm_head."); + } bool success = model_loader.load_tensors(tensors, ignore_tensors, n_threads, sd_ctx_params->enable_mmap); if (!success) { LOG_ERROR("load tensors from model loader failed"); @@ -898,6 +908,7 @@ class StableDiffusionGGML { } else if (sd_version_is_sd3(version) || sd_version_is_wan(version) || sd_version_is_qwen_image(version) || + version == VERSION_HIDREAM_O1 || sd_version_is_anima(version) || sd_version_is_ernie_image(version) || sd_version_is_z_image(version)) { @@ -1495,6 +1506,9 @@ class StableDiffusionGGML { if (sd_version_is_anima(version)) { return std::vector{t / static_cast(TIMESTEPS)}; } + if (version == VERSION_HIDREAM_O1) { + return std::vector{1.0f - (t / static_cast(TIMESTEPS))}; + } if (sd_version_is_z_image(version)) { return 
std::vector{1000.f - t}; } @@ -1607,6 +1621,10 @@ class StableDiffusionGGML { LOG_WARN("SLG is incompatible with this model type"); } + if (version == VERSION_HIDREAM_O1 && !noise.empty()) { + noise *= eta; + } + int64_t t0 = ggml_time_us(); sd::Tensor x_t = !noise.empty() ? denoiser->noise_scaling(sigmas[0], noise, init_latent) @@ -1679,12 +1697,19 @@ class StableDiffusionGGML { auto run_condition = [&](const SDCondition& condition, const sd::Tensor* c_concat_override = nullptr, const std::vector* local_skip_layers = nullptr) -> sd::Tensor { - diffusion_params.context = condition.c_crossattn.empty() ? nullptr : &condition.c_crossattn; - diffusion_params.c_concat = c_concat_override != nullptr ? c_concat_override : (condition.c_concat.empty() ? nullptr : &condition.c_concat); - diffusion_params.y = condition.c_vector.empty() ? nullptr : &condition.c_vector; - diffusion_params.t5_ids = condition.c_t5_ids.empty() ? nullptr : &condition.c_t5_ids; - diffusion_params.t5_weights = condition.c_t5_weights.empty() ? nullptr : &condition.c_t5_weights; - diffusion_params.skip_layers = local_skip_layers; + diffusion_params.context = condition.c_crossattn.empty() ? nullptr : &condition.c_crossattn; + diffusion_params.c_concat = c_concat_override != nullptr ? c_concat_override : (condition.c_concat.empty() ? nullptr : &condition.c_concat); + diffusion_params.y = condition.c_vector.empty() ? nullptr : &condition.c_vector; + diffusion_params.t5_ids = condition.c_t5_ids.empty() ? nullptr : &condition.c_t5_ids; + diffusion_params.t5_weights = condition.c_t5_weights.empty() ? nullptr : &condition.c_t5_weights; + diffusion_params.input_ids = condition.c_input_ids.empty() ? nullptr : &condition.c_input_ids; + diffusion_params.input_pos = condition.c_position_ids.empty() ? nullptr : &condition.c_position_ids; + diffusion_params.token_types = condition.c_token_types.empty() ? 
nullptr : &condition.c_token_types; + diffusion_params.image_embed_ranges = condition.c_image_embed_ranges.empty() ? nullptr : &condition.c_image_embed_ranges; + diffusion_params.vinput_mask = condition.c_vinput_mask.empty() ? nullptr : &condition.c_vinput_mask; + diffusion_params.vlm_images = condition.c_vlm_images.empty() ? nullptr : &condition.c_vlm_images; + diffusion_params.ref_latents = condition.c_ref_images.empty() ? &ref_latents : &condition.c_ref_images; + diffusion_params.skip_layers = local_skip_layers; sd::Tensor cached_output; if (step_cache.before_condition(&condition, noised_input, &cached_output)) { @@ -1831,6 +1856,8 @@ class StableDiffusionGGML { if (sd_version_is_dit(version)) { if (version == VERSION_WAN2_2_TI2V) { latent_channel = 48; + } else if (version == VERSION_HIDREAM_O1) { + latent_channel = 3; } else if (version == VERSION_CHROMA_RADIANCE) { latent_channel = 3; } else if (sd_version_uses_flux2_vae(version)) { @@ -2518,6 +2545,9 @@ static float resolve_eta(sd_ctx_t* sd_ctx, float eta, enum sample_method_t sample_method) { if (eta == INFINITY) { + if (sd_ctx->sd->version == VERSION_HIDREAM_O1) { + return 8.f; + } switch (sample_method) { case DDIM_TRAILING_SAMPLE_METHOD: case TCD_SAMPLE_METHOD: @@ -3009,6 +3039,9 @@ static std::optional prepare_image_generation_latents(sd std::vector> ref_latents; for (size_t i = 0; i < ref_images.size(); i++) { + if (sd_ctx->sd->version == VERSION_HIDREAM_O1) { + continue; + } sd::Tensor ref_latent; if (request->auto_resize_ref_image) { LOG_DEBUG("auto resize ref images"); diff --git a/src/tokenizers/qwen2_tokenizer.cpp b/src/tokenizers/qwen2_tokenizer.cpp index 9929ea387..46ee31172 100644 --- a/src/tokenizers/qwen2_tokenizer.cpp +++ b/src/tokenizers/qwen2_tokenizer.cpp @@ -81,6 +81,11 @@ Qwen2Tokenizer::Qwen2Tokenizer(const std::string& merges_utf8_str) { "", "", "", + "<|boi_token|>", + "<|bor_token|>", + "<|eor_token|>", + "<|bot_token|>", + "<|tms_token|>", }; if (merges_utf8_str.size() > 0) { diff 
--git a/src/vae.hpp b/src/vae.hpp
index 54bd88abf..35e73e41f 100644
--- a/src/vae.hpp
+++ b/src/vae.hpp
@@ -71,7 +71,7 @@ struct VAE : public GGMLRunner {
             scale_factor = 16;
         } else if (sd_version_uses_flux2_vae(version)) {
             scale_factor = 16;
-        } else if (version == VERSION_CHROMA_RADIANCE) {
+        } else if (version == VERSION_CHROMA_RADIANCE || version == VERSION_HIDREAM_O1) {
             scale_factor = 1;
         }
         return scale_factor;
From e56295d180c1321214e7f3240b61cb23ef6cbee7 Mon Sep 17 00:00:00 2001
From: leejet
Date: Mon, 11 May 2026 01:20:06 +0800
Subject: [PATCH 2/2] add euler flow flash sample method

---
 include/stable-diffusion.h | 1 +
 src/denoiser.hpp | 27 +++++++++++++++++++++++++++
 src/stable-diffusion.cpp | 2 ++
 3 files changed, 30 insertions(+)

diff --git a/include/stable-diffusion.h b/include/stable-diffusion.h
index c4c14949c..3596c20c5 100644
--- a/include/stable-diffusion.h
+++ b/include/stable-diffusion.h
@@ -37,6 +37,7 @@ enum rng_type_t {

 enum sample_method_t {
     EULER_SAMPLE_METHOD,
+    EULER_FLOW_FLASH_SAMPLE_METHOD,  // NOTE(review): inserted mid-enum, so every later enumerator shown here without an explicit value shifts by one; confirm no caller or binding relies on the old integers
     EULER_A_SAMPLE_METHOD,
     HEUN_SAMPLE_METHOD,
     DPM2_SAMPLE_METHOD,
diff --git a/src/denoiser.hpp b/src/denoiser.hpp
index 3f08706da..e7a25a6d5 100644
--- a/src/denoiser.hpp
+++ b/src/denoiser.hpp
@@ -867,6 +867,31 @@ static sd::Tensor sample_euler_flow(denoise_cb_t model,
     return x;
 }

+static sd::Tensor sample_euler_flow_flash(denoise_cb_t model,  // Sampler: each step takes the model's denoised estimate and blends it with freshly drawn noise for the next sigma
+                                          sd::Tensor x,
+                                          const std::vector& sigmas,
+                                          std::shared_ptr rng,
+                                          float eta) {
+    float s_noise = eta;  // eta is used directly as the multiplier on the freshly drawn noise
+    int steps = static_cast(sigmas.size()) - 1;  // one step per consecutive (sigma, sigma_next) pair; with fewer than two sigmas the loop is skipped and x is returned unchanged
+    for (int i = 0; i < steps; i++) {
+        float sigma = sigmas[i];
+        float sigma_next = sigmas[i + 1];
+        auto denoised_opt = model(x, sigma, i + 1);  // the step index handed to the callback is 1-based
+        if (denoised_opt.empty()) {
+            return {};  // an empty model result aborts sampling with an empty return value
+        }
+        sd::Tensor denoised = std::move(denoised_opt);
+        if (sigma_next == 0.0f) {  // at sigma_next == 0 the denoised estimate is taken as-is and no noise is drawn
+            x = std::move(denoised);
+            continue;
+        }
+        auto noise = sd::Tensor::randn_like(x, rng);
+        x = sigma_next * noise * s_noise + (1.0f - sigma_next) * denoised;  // the previous x is not on the right-hand side; it is only the randn_like argument above
+    }
+    return x;
+}
+
 static sd::Tensor
sample_euler(denoise_cb_t model, sd::Tensor x, const std::vector& sigmas) { @@ -1658,6 +1683,8 @@ static sd::Tensor sample_k_diffusion(sample_method_t method, float eta, bool is_flow_denoiser) { switch (method) { + case EULER_FLOW_FLASH_SAMPLE_METHOD: + return sample_euler_flow_flash(model, std::move(x), sigmas, rng, eta); case EULER_A_SAMPLE_METHOD: if (is_flow_denoiser) return sample_euler_flow(model, std::move(x), sigmas, rng, eta); diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index e4d5a7b60..ba2b3ea64 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -60,6 +60,7 @@ const char* model_version_to_str[] = { const char* sampling_methods_str[] = { "Euler", + "Euler Flow Flash", "Euler A", "Heun", "DPM2", @@ -1978,6 +1979,7 @@ enum rng_type_t str_to_rng_type(const char* str) { const char* sample_method_to_str[] = { "euler", + "euler_flow_flash", "euler_a", "heun", "dpm2",