Skip to content

Image preview #522

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 22 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,5 @@ test/
*.gguf
output*.png
models*
*.log
*.log
preview.png
85 changes: 72 additions & 13 deletions examples/cli/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,13 @@ const char* modes_str[] = {
"convert",
};

// Command-line spellings of the preview methods accepted by --preview.
// The table is indexed with sd_preview_t values (and a matched index is cast
// back to sd_preview_t when parsing), so the order must follow that enum.
const char* previews_str[] = {"none", "proj", "tae", "vae"};

enum SDMode {
TXT2IMG,
IMG2IMG,
Expand Down Expand Up @@ -129,6 +136,11 @@ struct SDParams {
float slg_scale = 0.;
float skip_layer_start = 0.01;
float skip_layer_end = 0.2;

sd_preview_t preview_method = SD_PREVIEW_NONE;
int preview_interval = 1;
std::string preview_path = "preview.png";
bool taesd_preview = false;
};

void print_params(SDParams params) {
Expand Down Expand Up @@ -174,27 +186,30 @@ void print_params(SDParams params) {
printf(" sample_steps: %d\n", params.sample_steps);
printf(" strength(img2img): %.2f\n", params.strength);
printf(" rng: %s\n", rng_type_to_str[params.rng_type]);
printf(" seed: %ld\n", params.seed);
printf(" seed: %lld\n", params.seed);
printf(" batch_count: %d\n", params.batch_count);
printf(" vae_tiling: %s\n", params.vae_tiling ? "true" : "false");
printf(" upscale_repeats: %d\n", params.upscale_repeats);
printf(" preview_mode: %s\n", previews_str[params.preview_method]);
printf(" preview_interval: %d\n", params.preview_interval);
}

void print_usage(int argc, const char* argv[]) {
printf("usage: %s [arguments]\n", argv[0]);
printf("\n");
printf("arguments:\n");
printf(" -h, --help show this help message and exit\n");
printf(" -M, --mode [MODEL] run mode (txt2img or img2img or convert, default: txt2img)\n");
printf(" -M, --mode [MODE] run mode (txt2img or img2img or convert, default: txt2img)\n");
printf(" -t, --threads N number of threads to use during computation (default: -1)\n");
printf(" If threads <= 0, then threads will be set to the number of CPU physical cores\n");
printf(" -m, --model [MODEL] path to full model\n");
printf(" --diffusion-model path to the standalone diffusion model\n");
printf(" --clip_l path to the clip-l text encoder\n");
printf(" --clip_g path to the clip-g text encoder\n");
printf(" --t5xxl path to the the t5xxl text encoder\n");
printf(" --diffusion-model [MODEL] path to the standalone diffusion model\n");
printf(" --clip_l [ENCODER] path to the clip-l text encoder\n");
printf(" --clip_g [ENCODER] path to the clip-g text encoder\n");
printf(" --t5xxl [ENCODER] path to the the t5xxl text encoder\n");
printf(" --vae [VAE] path to vae\n");
printf(" --taesd [TAESD_PATH] path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)\n");
printf(" --taesd [TAESD] path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)\n");
printf(" --taesd-preview-only prevents usage of taesd for decoding the final image. (for use with --preview %s)\n", previews_str[SD_PREVIEW_TAE]);
printf(" --control-net [CONTROL_PATH] path to control net model\n");
printf(" --embd-dir [EMBEDDING_PATH] path to embeddings\n");
printf(" --stacked-id-embd-dir [DIR] path to PHOTOMAKER stacked id embeddings\n");
Expand Down Expand Up @@ -243,6 +258,10 @@ void print_usage(int argc, const char* argv[]) {
printf(" This might crash if it is not supported by the backend.\n");
printf(" --control-net-cpu keep controlnet in cpu (for low vram)\n");
printf(" --canny apply canny preprocessor (edge detection)\n");
// Help text for the preview options. Every entry must end in '\n' so the
// following option starts on its own line.
printf(" --preview {%s,%s,%s,%s} preview method. (default is %s(disabled))\n", previews_str[0], previews_str[1], previews_str[2], previews_str[3], previews_str[SD_PREVIEW_NONE]);
printf(" %s is the fastest\n", previews_str[SD_PREVIEW_PROJ]);
// Fixed: this line previously lacked its trailing newline and ran into the
// --preview-path entry.
printf(" --preview-interval [N] How often to save the image preview\n");
// Fixed: placeholder bracket was mismatched ("[PATH}").
printf(" --preview-path [PATH] path to write preview image to (default: ./preview.png)\n");
printf(" --color Colors the logging tags according to level\n");
printf(" -v, --verbose print extra info\n");
}
Expand Down Expand Up @@ -507,6 +526,8 @@ void parse_args(int argc, const char** argv, SDParams& params) {
params.diffusion_flash_attn = true; // can reduce MEM significantly
} else if (arg == "--canny") {
params.canny_preprocess = true;
} else if (arg == "--taesd-preview-only") {
params.taesd_preview = true;
} else if (arg == "-b" || arg == "--batch-count") {
if (++i >= argc) {
invalid_arg = true;
Expand Down Expand Up @@ -629,6 +650,35 @@ void parse_args(int argc, const char** argv, SDParams& params) {
break;
}
params.skip_layer_end = std::stof(argv[i]);
} else if (arg == "--preview") {
if (++i >= argc) {
invalid_arg = true;
break;
}
const char* preview = argv[i];
int preview_method = -1;
for (int m = 0; m < N_PREVIEWS; m++) {
if (!strcmp(preview, previews_str[m])) {
preview_method = m;
}
}
if (preview_method == -1) {
invalid_arg = true;
break;
}
params.preview_method = (sd_preview_t)preview_method;
} else if (arg == "--preview-interval") {
if (++i >= argc) {
invalid_arg = true;
break;
}
params.preview_interval = std::stoi(argv[i]);
} else if (arg == "--preview-path") {
if (++i >= argc) {
invalid_arg = true;
break;
}
params.preview_path = argv[i];
} else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
print_usage(argc, argv);
Expand Down Expand Up @@ -787,12 +837,20 @@ void sd_log_cb(enum sd_log_level_t level, const char* log, void* data) {
fflush(out_stream);
}

// Destination file for preview images. main() points this at the character
// buffer of params.preview_path right after argument parsing, and
// step_callback() reads it each time it is invoked. Because it borrows the
// std::string's storage, params must stay alive (and its preview_path
// unmodified) for as long as the callback can run.
const char* preview_path;

// Preview callback: writes the given image as a PNG to the file named by the
// global preview_path, overwriting whatever was written on a previous call.
//
// step  - passed by the caller; not used here.
// image - pixels to write; width, height, channel and data are forwarded to
//         stbi_write_png unchanged.
//
// A failed write is reported on stderr instead of being dropped silently, and
// the call is skipped when there is no destination path or no pixel data.
void step_callback(int step, sd_image_t image) {
    (void)step;
    if (preview_path == NULL || image.data == NULL) {
        return;
    }
    // stbi_write_png returns 0 on failure.
    if (!stbi_write_png(preview_path, image.width, image.height, image.channel, image.data, 0)) {
        fprintf(stderr, "failed to write preview image to '%s'\n", preview_path);
    }
}

int main(int argc, const char* argv[]) {
SDParams params;

parse_args(argc, argv, params);
preview_path = params.preview_path.c_str();

sd_set_log_callback(sd_log_cb, (void*)&params);
sd_set_preview_callback((sd_preview_cb_t)step_callback, params.preview_method, params.preview_interval);

if (params.verbose) {
print_params(params);
Expand Down Expand Up @@ -900,7 +958,8 @@ int main(int argc, const char* argv[]) {
params.clip_on_cpu,
params.control_net_cpu,
params.vae_on_cpu,
params.diffusion_flash_attn);
params.diffusion_flash_attn,
params.taesd_preview);

if (sd_ctx == NULL) {
printf("new_sd_ctx_t failed\n");
Expand Down Expand Up @@ -1075,19 +1134,19 @@ int main(int argc, const char* argv[]) {

std::string dummy_name, ext, lc_ext;
bool is_jpg;
size_t last = params.output_path.find_last_of(".");
size_t last = params.output_path.find_last_of(".");
size_t last_path = std::min(params.output_path.find_last_of("/"),
params.output_path.find_last_of("\\"));
if (last != std::string::npos // filename has extension
&& (last_path == std::string::npos || last > last_path)) {
if (last != std::string::npos // filename has extension
&& (last_path == std::string::npos || last > last_path)) {
dummy_name = params.output_path.substr(0, last);
ext = lc_ext = params.output_path.substr(last);
std::transform(ext.begin(), ext.end(), lc_ext.begin(), ::tolower);
is_jpg = lc_ext == ".jpg" || lc_ext == ".jpeg" || lc_ext == ".jpe";
} else {
dummy_name = params.output_path;
ext = lc_ext = "";
is_jpg = false;
is_jpg = false;
}
// appending ".png" to absent or unknown extension
if (!is_jpg && lc_ext != ".png") {
Expand All @@ -1099,7 +1158,7 @@ int main(int argc, const char* argv[]) {
continue;
}
std::string final_image_path = i > 0 ? dummy_name + "_" + std::to_string(i + 1) + ext : dummy_name + ext;
if(is_jpg) {
if (is_jpg) {
stbi_write_jpg(final_image_path.c_str(), results[i].width, results[i].height, results[i].channel,
results[i].data, 90, get_image_params(params, params.seed + i).c_str());
printf("save result JPEG image to '%s'\n", final_image_path.c_str());
Expand Down
2 changes: 1 addition & 1 deletion ggml_extend.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -627,7 +627,7 @@ __STATIC_INLINE__ void sd_tiling(ggml_tensor* input, ggml_tensor* output, const
ggml_tensor* output_tile = ggml_new_tensor_4d(tiles_ctx, GGML_TYPE_F32, tile_size * scale, tile_size * scale, output->ne[2], 1);
on_processing(input_tile, NULL, true);
int num_tiles = ceil((float)input_width / non_tile_overlap) * ceil((float)input_height / non_tile_overlap);
LOG_INFO("processing %i tiles", num_tiles);
LOG_DEBUG("processing %i tiles", num_tiles);
pretty_progress(1, num_tiles, 0.0f);
int tile_count = 1;
bool last_y = false, last_x = false;
Expand Down
83 changes: 83 additions & 0 deletions latent-preview.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@

// Latent-to-RGB projection table with 16 rows (one per latent channel).
// When handed to preview_latent_image, row d supplies the {R, G, B} weights
// that latent channel d contributes to the output pixel.
// Coefficients from:
// https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L152-L169
const float flux_latent_rgb_proj[16][3] = {
    /* ch  0 */ {-0.0346f, 0.0244f, 0.0681f},
    /* ch  1 */ {0.0034f, 0.0210f, 0.0687f},
    /* ch  2 */ {0.0275f, -0.0668f, -0.0433f},
    /* ch  3 */ {-0.0174f, 0.0160f, 0.0617f},
    /* ch  4 */ {0.0859f, 0.0721f, 0.0329f},
    /* ch  5 */ {0.0004f, 0.0383f, 0.0115f},
    /* ch  6 */ {0.0405f, 0.0861f, 0.0915f},
    /* ch  7 */ {-0.0236f, -0.0185f, -0.0259f},
    /* ch  8 */ {-0.0245f, 0.0250f, 0.1180f},
    /* ch  9 */ {0.1008f, 0.0755f, -0.0421f},
    /* ch 10 */ {-0.0515f, 0.0201f, 0.0011f},
    /* ch 11 */ {0.0428f, -0.0012f, -0.0036f},
    /* ch 12 */ {0.0817f, 0.0765f, 0.0749f},
    /* ch 13 */ {-0.1264f, -0.0522f, -0.1103f},
    /* ch 14 */ {-0.0280f, -0.0881f, -0.0499f},
    /* ch 15 */ {-0.1262f, -0.0982f, -0.0778f},
};

// Latent-to-RGB projection table with 16 rows (one per latent channel).
// When handed to preview_latent_image, row d supplies the {R, G, B} weights
// that latent channel d contributes to the output pixel.
// Coefficients from:
// https://github.com/Stability-AI/sd3.5/blob/main/sd3_impls.py#L228-L246
const float sd3_latent_rgb_proj[16][3] = {
    /* ch  0 */ {-0.0645f, 0.0177f, 0.1052f},
    /* ch  1 */ {0.0028f, 0.0312f, 0.0650f},
    /* ch  2 */ {0.1848f, 0.0762f, 0.0360f},
    /* ch  3 */ {0.0944f, 0.0360f, 0.0889f},
    /* ch  4 */ {0.0897f, 0.0506f, -0.0364f},
    /* ch  5 */ {-0.0020f, 0.1203f, 0.0284f},
    /* ch  6 */ {0.0855f, 0.0118f, 0.0283f},
    /* ch  7 */ {-0.0539f, 0.0658f, 0.1047f},
    /* ch  8 */ {-0.0057f, 0.0116f, 0.0700f},
    /* ch  9 */ {-0.0412f, 0.0281f, -0.0039f},
    /* ch 10 */ {0.1106f, 0.1171f, 0.1220f},
    /* ch 11 */ {-0.0248f, 0.0682f, -0.0481f},
    /* ch 12 */ {0.0815f, 0.0846f, 0.1207f},
    /* ch 13 */ {-0.0120f, -0.0055f, -0.0867f},
    /* ch 14 */ {-0.0749f, -0.0634f, -0.0456f},
    /* ch 15 */ {-0.1418f, -0.1457f, -0.1259f},
};

// Latent-to-RGB projection table with 4 rows (one per latent channel).
// When handed to preview_latent_image, row d supplies the {R, G, B} weights
// that latent channel d contributes to the output pixel.
// Coefficients from:
// https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L32-L38
const float sdxl_latent_rgb_proj[4][3] = {
    /* ch 0 */ {0.3651f, 0.4232f, 0.4341f},
    /* ch 1 */ {-0.2533f, -0.0042f, 0.1068f},
    /* ch 2 */ {0.1076f, 0.1111f, -0.0362f},
    /* ch 3 */ {-0.3165f, -0.2492f, -0.2188f},
};

// Latent-to-RGB projection table with 4 rows (one per latent channel).
// When handed to preview_latent_image, row d supplies the {R, G, B} weights
// that latent channel d contributes to the output pixel.
// Coefficients from:
// https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L32-L38
// NOTE(review): this link and line range are identical to the ones given for
// sdxl_latent_rgb_proj although the values differ - confirm the correct line
// range for this table.
//
// Uses the same `= { ... }` initializer form as the other projection tables.
const float sd_latent_rgb_proj[4][3] = {
    /* ch 0 */ {0.3512f, 0.2297f, 0.3227f},
    /* ch 1 */ {0.3250f, 0.4974f, 0.2350f},
    /* ch 2 */ {-0.2829f, 0.1762f, 0.2721f},
    /* ch 3 */ {-0.2120f, -0.2616f, -0.7177f},
};

/**
 * Convert a latent tensor into a packed 8-bit RGB image.
 *
 * For each position (x, y) the first `dim` channel values are multiplied by
 * the matching rows of `latent_rgb_proj` and summed into R, G and B. Each sum
 * is then remapped with v * 0.5 + 0.5, clamped to [0, 1], scaled by 255 and
 * truncated to a byte. Pixels are emitted row by row, three bytes per pixel.
 *
 * @param buffer          Output; receives width * height * 3 bytes.
 * @param latents         Source tensor. Elements are addressed through the
 *                        byte strides nb[0] (x), nb[1] (y) and nb[2]
 *                        (channel) and read as float. The tensor's element
 *                        type is not checked - assumes 32-bit float data,
 *                        TODO confirm at the call site.
 * @param latent_rgb_proj `dim` rows of {R, G, B} weights.
 * @param width           Number of columns to read/write.
 * @param height          Number of rows to read/write.
 * @param dim             Number of latent channels to project.
 *
 * Marked `inline` because the definition lives in a header: without it,
 * including the header from more than one translation unit produces
 * duplicate-symbol link errors.
 */
inline void preview_latent_image(uint8_t* buffer, struct ggml_tensor* latents, const float (*latent_rgb_proj)[3], int width, int height, int dim) {
    const char* base = (const char*)latents->data;
    size_t out       = 0;
    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++) {
            // Byte offset of channel 0 at (x, y).
            const size_t pixel_offset = (x * latents->nb[0] + y * latents->nb[1]);

            float r = 0, g = 0, b = 0;
            for (int d = 0; d < dim; d++) {
                const float value = *(const float*)(base + pixel_offset + d * latents->nb[2]);
                r += value * latent_rgb_proj[d][0];
                g += value * latent_rgb_proj[d][1];
                b += value * latent_rgb_proj[d][2];
            }

            // change range
            r = r * .5f + .5f;
            g = g * .5f + .5f;
            b = b * .5f + .5f;

            // clamp rgb values to [0,1] range; the comparison order also maps
            // NaN to 0 because `NaN >= 0` is false
            r = r >= 0 ? r <= 1 ? r : 1 : 0;
            g = g >= 0 ? g <= 1 ? g : 1 : 0;
            b = b >= 0 ? b <= 1 ? b : 1 : 0;

            buffer[out++] = (uint8_t)(r * 255);
            buffer[out++] = (uint8_t)(g * 255);
            buffer[out++] = (uint8_t)(b * 255);
        }
    }
}
Loading
Loading