Merged
4 changes: 2 additions & 2 deletions src/denoiser.hpp
@@ -1289,8 +1289,8 @@ static sd::Tensor<float> sample_res_multistep(denoise_cb_t model,
        }
        sd::Tensor<float> denoised = std::move(denoised_opt);

-        float sigma_from = sigmas[i];
-        float sigma_to = sigmas[i + 1];
+        float sigma_from = sigmas[i];
+        float sigma_to = sigmas[i + 1];

        auto [sigma_down, sigma_up, alpha_scale] = get_ancestral_step(sigma_from, sigma_to, eta, is_flow_denoiser);

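For context on the hunk above: get_ancestral_step is not part of this diff, but under the k-diffusion convention it splits the sigma_from -> sigma_to transition into a deterministic move down to sigma_down plus re-injection of sigma_up fresh noise. A minimal sketch under that assumption (the flow-denoiser alpha_scale path is omitted):

#include <algorithm>
#include <cmath>
#include <utility>

// Sketch only; assumes the k-diffusion ancestral-step convention.
static std::pair<float, float> ancestral_step_sketch(float sigma_from, float sigma_to, float eta) {
    // Noise to re-inject after the deterministic part of the step.
    float sigma_up = std::min(sigma_to,
                              eta * std::sqrt(sigma_to * sigma_to *
                                              (sigma_from * sigma_from - sigma_to * sigma_to) /
                                              (sigma_from * sigma_from)));
    // Noise level the deterministic part targets, chosen so the combined
    // step still has the variance of sigma_to.
    float sigma_down = std::sqrt(sigma_to * sigma_to - sigma_up * sigma_up);
    return {sigma_down, sigma_up};
}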
19 changes: 18 additions & 1 deletion src/ggml_extend.hpp
@@ -2567,7 +2567,24 @@ struct GGMLRunner {

    bool alloc_params_buffer() {
        size_t num_tensors = ggml_tensor_num(params_ctx);
-        params_buffer = ggml_backend_alloc_ctx_tensors(params_ctx, params_backend);
+        if (num_tensors > 0) {
+            // ggml_backend_alloc_ctx_tensors fails when all tensors are already allocated
+            // (typical for memory-mapped weights). See ggml-alloc.c n_buffers==0 branch.
+            bool all_have_data = true;
+            for (ggml_tensor* t = ggml_get_first_tensor(params_ctx); t != nullptr; t = ggml_get_next_tensor(params_ctx, t)) {
+                if (t->data == nullptr) {
+                    all_have_data = false;
+                    break;
+                }
+            }
+            if (all_have_data) {
+                LOG_DEBUG("%s all params already mmap-allocated (no separate buffer needed)", get_desc().c_str());
+                params_buffer = nullptr;
+                rebuild_params_tensor_set();
+                return true;
+            }
+        }
+        params_buffer = ggml_backend_alloc_ctx_tensors(params_ctx, params_backend);
        if (params_buffer == nullptr) {
            LOG_ERROR("%s alloc params backend buffer failed, num_tensors = %i",
                      get_desc().c_str(),
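Why the new check matters: when every tensor in params_ctx already has data, for example because the loader pointed the weights into a memory-mapped file, ggml_backend_alloc_ctx_tensors has nothing left to allocate and returns NULL, which the old code misread as failure. A sketch of how params end up pre-populated; mmap_base, mmap_size, and offset_of are illustrative stand-ins, not names from this PR:

// Assumed setup: all weights already live inside one mmap-backed CPU buffer.
static void point_params_into_mapping(ggml_context* params_ctx, uint8_t* mmap_base, size_t mmap_size) {
    ggml_backend_buffer_t buf = ggml_backend_cpu_buffer_from_ptr(mmap_base, mmap_size);
    for (ggml_tensor* t = ggml_get_first_tensor(params_ctx); t != nullptr;
         t = ggml_get_next_tensor(params_ctx, t)) {
        t->buffer = buf;                      // tensor borrows the mmap buffer
        t->data   = mmap_base + offset_of(t); // offset_of(): hypothetical offset lookup
    }
    // ggml_backend_alloc_ctx_tensors(params_ctx, params_backend) now returns
    // NULL because no new buffer is needed -- success, not an error.
}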
195 changes: 166 additions & 29 deletions src/model.cpp
@@ -730,16 +730,10 @@ void ModelLoader::set_wtype_override(ggml_type wtype, std::string tensor_type_ru
    }
}

-bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads_p, bool enable_mmap) {
-    int64_t process_time_ms = 0;
-    std::atomic<int64_t> read_time_ms(0);
-    std::atomic<int64_t> memcpy_time_ms(0);
-    std::atomic<int64_t> copy_to_backend_time_ms(0);
-    std::atomic<int64_t> convert_time_ms(0);
-    std::atomic<uint64_t> bytes_processed(0);
-
-    int num_threads_to_use = n_threads_p > 0 ? n_threads_p : sd_get_num_physical_cores();
-    LOG_DEBUG("using %d threads for model loading", num_threads_to_use);
+void ModelLoader::process_model_files(bool enable_mmap, bool writable_mmap) {
+    if (model_files_processed) {
+        return;
+    }

    int64_t start_time = ggml_time_ms();

@@ -751,22 +745,13 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
        processed_tensor_storages.push_back(tensor_storage);
    }

-    process_time_ms = ggml_time_ms() - start_time;
-
-    bool success = true;
-    size_t total_tensors_processed = 0;
-    const size_t total_tensors_to_process = processed_tensor_storages.size();
-    const int64_t t_start = ggml_time_ms();
-    int last_n_threads = 1;
-
    for (size_t file_index = 0; file_index < file_paths_.size(); file_index++) {
        std::string file_path = file_paths_[file_index];
        LOG_DEBUG("loading tensors from %s", file_path.c_str());
-
-        std::vector<const TensorStorage*> file_tensors;
+        std::vector<TensorStorage> file_tensors;
        for (const auto& ts : processed_tensor_storages) {
            if (ts.file_index == file_index) {
-                file_tensors.push_back(&ts);
+                file_tensors.push_back(ts);
            }
        }
        if (file_tensors.empty()) {
@@ -775,21 +760,169 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread

        bool is_zip = false;
        for (auto const& ts : file_tensors) {
-            if (ts->index_in_zip >= 0) {
+            if (ts.index_in_zip >= 0) {
                is_zip = true;
                break;
            }
        }

-        std::unique_ptr<MmapWrapper> mmapped;
+        ModelFileData fdata = {};
+        fdata.path = file_path;
+        fdata.is_zip = is_zip;
+        fdata.tensors = std::move(file_tensors);

        if (enable_mmap && !is_zip) {
-            LOG_DEBUG("using mmap for I/O");
-            mmapped = MmapWrapper::create(file_path);
-            if (!mmapped) {
-                LOG_WARN("failed to memory-map '%s'", file_path.c_str());
-            }
+            std::unique_ptr<MmapWrapper> mmapped = MmapWrapper::create(file_path, writable_mmap);
+            if (mmapped) {
+                uint8_t* mmap_data = static_cast<uint8_t*>(mmapped->writable_data());
+                ggml_backend_buffer_t buf_mmap = ggml_backend_cpu_buffer_from_ptr(mmap_data, mmapped->size());
+                if (buf_mmap) {
+                    LOG_INFO("using mmap for '%s'", file_path.c_str());
+                    fdata.mmbuffer = std::shared_ptr<struct ggml_backend_buffer>(buf_mmap, ggml_backend_buffer_free);
+                } else {
+                    LOG_WARN("mmap: failed to create backend buffer for file %s", fdata.path.c_str());
+                }
+                fdata.mmapped = std::shared_ptr<MmapWrapper>(std::move(mmapped));
+            } else {
+                LOG_WARN("failed to memory-map '%s' (falling back to read())", file_path.c_str());
+            }
+        } else if (!is_zip) {
+            LOG_INFO("NOT using mmap for '%s' (mmap disabled by caller)", file_path.c_str());
        }

+        file_data.push_back(std::move(fdata));
    }

+    model_files_processed = true;
+
+    int64_t end_time = ggml_time_ms();
+    int64_t process_time_ms = end_time - start_time;
+
+    LOG_INFO("model files processing completed in %.2fs", process_time_ms / 1000.f);
+}
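A note on the ownership pattern in process_model_files: ggml_backend_buffer_t is a raw pointer type, so the code parks it in a shared_ptr whose custom deleter is ggml_backend_buffer_free, and keeps the MmapWrapper in a second shared_ptr so the mapping cannot be torn down while any tensor still points into it. The same idiom in isolation (FileBacking is illustrative, not a type from this PR):

#include <memory>

// Members destruct in reverse declaration order: the backend buffer is
// freed first, then the mapping itself is released.
struct FileBacking {
    std::shared_ptr<MmapWrapper> mapping;
    std::shared_ptr<struct ggml_backend_buffer> buffer;
};

static FileBacking make_backing(std::unique_ptr<MmapWrapper> m, ggml_backend_buffer_t b) {
    FileBacking fb;
    fb.buffer  = std::shared_ptr<struct ggml_backend_buffer>(b, ggml_backend_buffer_free);
    fb.mapping = std::shared_ptr<MmapWrapper>(std::move(m));
    return fb;
}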

+std::vector<MmapTensorStore> ModelLoader::mmap_tensors(std::map<std::string, ggml_tensor*>& tensors,
+                                                       std::set<std::string> ignore_tensors,
+                                                       bool writable_mmap) {
+    process_model_files(true, writable_mmap);
+
+    std::vector<MmapTensorStore> result;
+    uint64_t mapped_bytes = 0;
+    size_t mapped_tensors = 0;
+
+    LOG_DEBUG("memory-mapping tensors...");
+
+    int64_t t_start = ggml_time_ms();
+
+    for (auto& fdata : file_data) {
+        if (!fdata.mmbuffer)
+            continue;
+
+        const std::vector<TensorStorage>& file_tensors = fdata.tensors;
+
+        size_t file_mapped_bytes = 0;
+        size_t file_mapped_tensors = 0;
+
+        for (const auto& tensor_storage : file_tensors) {
+            const std::string& name = tensor_storage.name;
+
+            bool is_ignored = false;
+            for (const auto& ignore_prefix : ignore_tensors) {
+                if (starts_with(name, ignore_prefix)) {
+                    is_ignored = true;
+                    break;
+                }
+            }
+            if (is_ignored)
+                continue;
+
+            auto it = tensors.find(name);
+            if (it == tensors.end())
+                continue;
+
+            ggml_tensor* dst_tensor = it->second;
+            if (dst_tensor == nullptr)
+                continue;
+
+            if (tensor_storage.type != dst_tensor->type)
+                continue;
+
+            size_t tensor_size = tensor_storage.nbytes();
+            size_t tensor_offset = tensor_storage.offset;
+
+            if (tensor_storage.ne[0] != dst_tensor->ne[0] ||
+                tensor_storage.ne[1] != dst_tensor->ne[1] ||
+                tensor_storage.ne[2] != dst_tensor->ne[2] ||
+                tensor_storage.ne[3] != dst_tensor->ne[3] ||
+                tensor_size != ggml_nbytes(dst_tensor)) {
+                // let load_tensors worry about this
+                continue;
+            }
+
+            ggml_backend_buffer_t buf_mmap = fdata.mmbuffer.get();
+            uint8_t* mmap_data = static_cast<uint8_t*>(ggml_backend_buffer_get_base(buf_mmap));
+            dst_tensor->buffer = buf_mmap;
+            dst_tensor->data = mmap_data + tensor_offset;
+
+            file_mapped_bytes += tensor_size;
+            file_mapped_tensors++;
+        }
+
+        if (file_mapped_bytes > 0) {
+            mapped_tensors += file_mapped_tensors;
+            mapped_bytes += file_mapped_bytes;
+            result.push_back({fdata.mmapped, fdata.mmbuffer});
+        }
+    }
+
+    int64_t t_end = ggml_time_ms();
+    int64_t duration_ms = t_end - t_start;
+
+    LOG_INFO("memory-mapped %zu tensors in %zu files (%.2f MB), taking %.2fs",
+             mapped_tensors,
+             result.size(),
+             mapped_bytes / (1024.0 * 1024.0),
+             duration_ms / 1000.0);
+
+    return result;
+}
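Putting the two phases together, a caller would look roughly like this, based on the declarations added to model.h below; collect_model_tensors is a hypothetical helper, and the real call sites are not part of this PR:

std::map<std::string, ggml_tensor*> tensors;
collect_model_tensors(tensors); // hypothetical: fill name -> destination tensor

// Phase 1: zero-copy map every tensor whose dtype and shape already match;
// mismatches fall through (the "let load_tensors worry about this" branch).
std::vector<MmapTensorStore> stores = loader.mmap_tensors(tensors, /*ignore_tensors=*/{}, /*writable=*/false);

// Phase 2: read and convert the remainder; tensors mapped in phase 1 are
// skipped by the buffer-identity check in the loading loop.
loader.load_tensors(tensors);

// Keep `stores` alive for as long as the mapped tensors are in use;
// dropping it frees the backend buffers and unmaps the files.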

+bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads_p, bool enable_mmap) {
+    process_model_files(enable_mmap, false);
+
+    std::atomic<int64_t> read_time_ms(0);
+    std::atomic<int64_t> memcpy_time_ms(0);
+    std::atomic<int64_t> copy_to_backend_time_ms(0);
+    std::atomic<int64_t> convert_time_ms(0);
+    std::atomic<uint64_t> bytes_processed(0);
+
+    int num_threads_to_use = n_threads_p > 0 ? n_threads_p : sd_get_num_physical_cores();
+    LOG_DEBUG("using %d threads for model loading", num_threads_to_use);
+
+    int64_t start_time = ggml_time_ms();
+
+    size_t total_tensors_to_process = 0;
+    for (const auto& fdata : file_data) {
+        total_tensors_to_process += fdata.tensors.size();
+    }
+
+    bool success = true;
+    size_t total_tensors_processed = 0;
+    const int64_t t_start = start_time;
+    int last_n_threads = 1;
+
+    for (auto& fdata : file_data) {
+        const std::string& file_path = fdata.path;
+        LOG_DEBUG("loading tensors from %s", file_path.c_str());
+
+        const std::vector<TensorStorage>& file_tensors = fdata.tensors;
+
+        bool is_zip = fdata.is_zip;
+
+        std::shared_ptr<MmapWrapper> mmapped = fdata.mmapped;
+
        int n_threads = is_zip ? 1 : std::min(num_threads_to_use, (int)file_tensors.size());
        if (n_threads < 1) {
            n_threads = 1;
Expand Down Expand Up @@ -830,7 +963,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
                    break;
                }

-                const TensorStorage& tensor_storage = *file_tensors[idx];
+                const TensorStorage& tensor_storage = file_tensors[idx];
                ggml_tensor* dst_tensor = nullptr;

                t0 = ggml_time_ms();
@@ -847,6 +980,11 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
                    continue;
                }

+                // skip mmapped tensors
+                if (dst_tensor->buffer != nullptr && dst_tensor->buffer == fdata.mmbuffer.get()) {
+                    continue;
+                }
+
                size_t nbytes_to_read = tensor_storage.nbytes_to_read();

                auto read_data = [&](char* buf, size_t n) {
@@ -990,9 +1128,8 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
    }

    int64_t end_time = ggml_time_ms();
-    LOG_INFO("loading tensors completed, taking %.2fs (process: %.2fs, read: %.2fs, memcpy: %.2fs, convert: %.2fs, copy_to_backend: %.2fs)",
+    LOG_INFO("loading tensors completed, taking %.2fs (read: %.2fs, memcpy: %.2fs, convert: %.2fs, copy_to_backend: %.2fs)",
             (end_time - start_time) / 1000.f,
-             process_time_ms / 1000.f,
             (read_time_ms.load() / (float)last_n_threads) / 1000.f,
             (memcpy_time_ms.load() / (float)last_n_threads) / 1000.f,
             (convert_time_ms.load() / (float)last_n_threads) / 1000.f,
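One detail of the loading loop worth spelling out: a tensor counts as already loaded exactly when its buffer pointer is the file's own mmap buffer, so no extra bookkeeping flags are needed. The predicate, restated:

// True iff mmap_tensors() already pointed dst_tensor into this file's
// mapping; such tensors are served by the page cache, so there is
// nothing to read, convert, or copy.
bool already_mapped = dst_tensor->buffer != nullptr &&
                      dst_tensor->buffer == fdata.mmbuffer.get();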
21 changes: 21 additions & 0 deletions src/model.h
@@ -193,10 +193,27 @@ using TensorTypeRules = std::vector<std::pair<std::string, ggml_type>>;

TensorTypeRules parse_tensor_type_rules(const std::string& tensor_type_rules);

+class MmapWrapper;
+
+struct ModelFileData {
+    std::string path;
+    std::vector<TensorStorage> tensors;
+    std::shared_ptr<MmapWrapper> mmapped;
+    std::shared_ptr<struct ggml_backend_buffer> mmbuffer;
+    bool is_zip;
+};
+
+struct MmapTensorStore {
+    std::shared_ptr<MmapWrapper> mmapped;
+    std::shared_ptr<struct ggml_backend_buffer> mmbuffer;
+};
+
class ModelLoader {
protected:
    SDVersion version_ = VERSION_COUNT;
    std::vector<std::string> file_paths_;
+    std::vector<ModelFileData> file_data;
+    bool model_files_processed = false;
    String2TensorStorage tensor_storage_map;

    void add_tensor_storage(const TensorStorage& tensor_storage);
@@ -220,6 +237,10 @@ class ModelLoader {
    std::map<ggml_type, uint32_t> get_vae_wtype_stat();
    String2TensorStorage& get_tensor_storage_map() { return tensor_storage_map; }
    void set_wtype_override(ggml_type wtype, std::string tensor_type_rules = "");
+    void process_model_files(bool enable_mmap = false, bool writable_mmap = true);
+    std::vector<MmapTensorStore> mmap_tensors(std::map<std::string, ggml_tensor*>& tensors,
+                                              std::set<std::string> ignore_tensors = {},
+                                              bool writable = true);
    bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads = 0, bool use_mmap = false);
    bool load_tensors(std::map<std::string, ggml_tensor*>& tensors,
                      std::set<std::string> ignore_tensors = {},