leejet · stduhpf · Oct 30, 2024 · Oct 30, 2024 · Nov 24, 2024 · Nov 24, 2024
diff --git a/.gitignore b/.gitignore
@@ -10,4 +10,5 @@ test/
 *.gguf
 output*.png
 models*
-*.log
+*.log
+preview.png
diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
@@ -60,6 +60,13 @@ const char* modes_str[] = {
     "convert",
 };
 
+const char* previews_str[] = {  // CLI names accepted by --preview; an entry's index is cast to sd_preview_t by parse_args
+    "none",
+    "proj",
+    "tae",
+    "vae",
+};
+
 enum SDMode {
     TXT2IMG,
     IMG2IMG,
@@ -129,6 +136,11 @@ struct SDParams {
     float slg_scale              = 0.;
     float skip_layer_start       = 0.01;
     float skip_layer_end         = 0.2;
+
+    sd_preview_t preview_method = SD_PREVIEW_NONE;
+    int preview_interval        = 1;
+    std::string preview_path    = "preview.png";
+    bool taesd_preview          = false;
 };
 
 void print_params(SDParams params) {
@@ -174,27 +186,30 @@ void print_params(SDParams params) {
     printf("    sample_steps:      %d\n", params.sample_steps);
     printf("    strength(img2img): %.2f\n", params.strength);
     printf("    rng:               %s\n", rng_type_to_str[params.rng_type]);
-    printf("    seed:              %ld\n", params.seed);
+    printf("    seed:              %lld\n", params.seed);
     printf("    batch_count:       %d\n", params.batch_count);
     printf("    vae_tiling:        %s\n", params.vae_tiling ? "true" : "false");
     printf("    upscale_repeats:   %d\n", params.upscale_repeats);
+    printf("    preview_mode:      %s\n", previews_str[params.preview_method]);
+    printf("    preview_interval:  %d\n", params.preview_interval);
 }
 
 void print_usage(int argc, const char* argv[]) {
     printf("usage: %s [arguments]\n", argv[0]);
     printf("\n");
     printf("arguments:\n");
     printf("  -h, --help                         show this help message and exit\n");
-    printf("  -M, --mode [MODEL]                 run mode (txt2img or img2img or convert, default: txt2img)\n");
+    printf("  -M, --mode [MODE]                  run mode (txt2img or img2img or convert, default: txt2img)\n");
     printf("  -t, --threads N                    number of threads to use during computation (default: -1)\n");
     printf("                                     If threads <= 0, then threads will be set to the number of CPU physical cores\n");
     printf("  -m, --model [MODEL]                path to full model\n");
-    printf("  --diffusion-model                  path to the standalone diffusion model\n");
-    printf("  --clip_l                           path to the clip-l text encoder\n");
-    printf("  --clip_g                           path to the clip-g text encoder\n");
-    printf("  --t5xxl                            path to the the t5xxl text encoder\n");
+    printf("  --diffusion-model [MODEL]          path to the standalone diffusion model\n");
+    printf("  --clip_l [ENCODER]                 path to the clip-l text encoder\n");
+    printf("  --clip_g [ENCODER]                 path to the clip-g text encoder\n");
+    printf("  --t5xxl [ENCODER]                  path to the t5xxl text encoder\n");
     printf("  --vae [VAE]                        path to vae\n");
-    printf("  --taesd [TAESD_PATH]               path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)\n");
+    printf("  --taesd [TAESD]                    path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)\n");
+    printf("  --taesd-preview-only               prevents usage of taesd for decoding the final image. (for use with --preview %s)\n", previews_str[SD_PREVIEW_TAE]);
     printf("  --control-net [CONTROL_PATH]       path to control net model\n");
     printf("  --embd-dir [EMBEDDING_PATH]        path to embeddings\n");
     printf("  --stacked-id-embd-dir [DIR]        path to PHOTOMAKER stacked id embeddings\n");
@@ -243,6 +258,10 @@ void print_usage(int argc, const char* argv[]) {
     printf("                                     This might crash if it is not supported by the backend.\n");
     printf("  --control-net-cpu                  keep controlnet in cpu (for low vram)\n");
     printf("  --canny                            apply canny preprocessor (edge detection)\n");
+    printf("  --preview {%s,%s,%s,%s}            preview method. (default is %s(disabled))\n", previews_str[0], previews_str[1], previews_str[2], previews_str[3], previews_str[SD_PREVIEW_NONE]);
+    printf("                                     %s is the fastest\n", previews_str[SD_PREVIEW_PROJ]);
+    printf("  --preview-interval [N]             How often to save the image preview\n");
+    printf("  --preview-path [PATH]              path to write preview image to (default: ./preview.png)\n");
     printf("  --color                            Colors the logging tags according to level\n");
     printf("  -v, --verbose                      print extra info\n");
 }
@@ -507,6 +526,8 @@ void parse_args(int argc, const char** argv, SDParams& params) {
             params.diffusion_flash_attn = true;  // can reduce MEM significantly
         } else if (arg == "--canny") {
             params.canny_preprocess = true;
+        } else if (arg == "--taesd-preview-only") {
+            params.taesd_preview = true;
         } else if (arg == "-b" || arg == "--batch-count") {
             if (++i >= argc) {
                 invalid_arg = true;
@@ -629,6 +650,35 @@ void parse_args(int argc, const char** argv, SDParams& params) {
                 break;
             }
             params.skip_layer_end = std::stof(argv[i]);
+        } else if (arg == "--preview") {
+            if (++i >= argc) {
+                invalid_arg = true;
+                break;
+            }
+            const char* preview = argv[i];
+            int preview_method  = -1;
+            for (int m = 0; m < N_PREVIEWS; m++) {
+                if (!strcmp(preview, previews_str[m])) {
+                    preview_method = m;
+                }
+            }
+            if (preview_method == -1) {
+                invalid_arg = true;
+                break;
+            }
+            params.preview_method = (sd_preview_t)preview_method;
+        } else if (arg == "--preview-interval") {
+            if (++i >= argc) {
+                invalid_arg = true;
+                break;
+            }
+            params.preview_interval = std::stoi(argv[i]);
+        } else if (arg == "--preview-path") {
+            if (++i >= argc) {
+                invalid_arg = true;
+                break;
+            }
+            params.preview_path = argv[i];
         } else {
             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
             print_usage(argc, argv);
@@ -787,12 +837,20 @@ void sd_log_cb(enum sd_log_level_t level, const char* log, void* data) {
     fflush(out_stream);
 }
 
+const char* preview_path;  // assigned in main() from params.preview_path.c_str(); borrowed, not owned
+
+void step_callback(int step, sd_image_t image) {  // `step` is unused; the image is always written to preview_path
+    stbi_write_png(preview_path, image.width, image.height, image.channel, image.data, 0);  // return value is ignored
+}
+
 int main(int argc, const char* argv[]) {
     SDParams params;
 
     parse_args(argc, argv, params);
+    preview_path = params.preview_path.c_str();
 
     sd_set_log_callback(sd_log_cb, (void*)&params);
+    sd_set_preview_callback((sd_preview_cb_t)step_callback, params.preview_method, params.preview_interval);
 
     if (params.verbose) {
         print_params(params);
@@ -900,7 +958,8 @@ int main(int argc, const char* argv[]) {
                                   params.clip_on_cpu,
                                   params.control_net_cpu,
                                   params.vae_on_cpu,
-                                  params.diffusion_flash_attn);
+                                  params.diffusion_flash_attn,
+                                  params.taesd_preview);
 
     if (sd_ctx == NULL) {
         printf("new_sd_ctx_t failed\n");
@@ -1075,19 +1134,19 @@ int main(int argc, const char* argv[]) {
 
     std::string dummy_name, ext, lc_ext;
     bool is_jpg;
-    size_t last = params.output_path.find_last_of(".");
+    size_t last      = params.output_path.find_last_of(".");
     size_t last_path = std::min(params.output_path.find_last_of("/"),
                                 params.output_path.find_last_of("\\"));
-    if (last != std::string::npos // filename has extension
-    && (last_path == std::string::npos || last > last_path)) {
+    if (last != std::string::npos  // filename has extension
+        && (last_path == std::string::npos || last > last_path)) {
         dummy_name = params.output_path.substr(0, last);
         ext = lc_ext = params.output_path.substr(last);
         std::transform(ext.begin(), ext.end(), lc_ext.begin(), ::tolower);
         is_jpg = lc_ext == ".jpg" || lc_ext == ".jpeg" || lc_ext == ".jpe";
     } else {
         dummy_name = params.output_path;
         ext = lc_ext = "";
-        is_jpg = false;
+        is_jpg       = false;
     }
     // appending ".png" to absent or unknown extension
     if (!is_jpg && lc_ext != ".png") {
@@ -1099,7 +1158,7 @@ int main(int argc, const char* argv[]) {
             continue;
         }
         std::string final_image_path = i > 0 ? dummy_name + "_" + std::to_string(i + 1) + ext : dummy_name + ext;
-        if(is_jpg) {
+        if (is_jpg) {
             stbi_write_jpg(final_image_path.c_str(), results[i].width, results[i].height, results[i].channel,
                            results[i].data, 90, get_image_params(params, params.seed + i).c_str());
             printf("save result JPEG image to '%s'\n", final_image_path.c_str());

diff --git a/ggml_extend.hpp b/ggml_extend.hpp
@@ -627,7 +627,7 @@ __STATIC_INLINE__ void sd_tiling(ggml_tensor* input, ggml_tensor* output, const
     ggml_tensor* output_tile = ggml_new_tensor_4d(tiles_ctx, GGML_TYPE_F32, tile_size * scale, tile_size * scale, output->ne[2], 1);
     on_processing(input_tile, NULL, true);
     int num_tiles = ceil((float)input_width / non_tile_overlap) * ceil((float)input_height / non_tile_overlap);
-    LOG_INFO("processing %i tiles", num_tiles);
+    LOG_DEBUG("processing %i tiles", num_tiles);
     pretty_progress(1, num_tiles, 0.0f);
     int tile_count = 1;
     bool last_y = false, last_x = false;

diff --git a/latent-preview.h b/latent-preview.h
@@ -0,0 +1,83 @@
+
+// https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L152-L169
+const float flux_latent_rgb_proj[16][3] = {  // 16 rows x 3 columns; same row layout preview_latent_image() indexes as latent_rgb_proj[d][0..2]
+    {-0.0346f, 0.0244f, 0.0681f},
+    {0.0034f, 0.0210f, 0.0687f},
+    {0.0275f, -0.0668f, -0.0433f},
+    {-0.0174f, 0.0160f, 0.0617f},
+    {0.0859f, 0.0721f, 0.0329f},
+    {0.0004f, 0.0383f, 0.0115f},
+    {0.0405f, 0.0861f, 0.0915f},
+    {-0.0236f, -0.0185f, -0.0259f},
+    {-0.0245f, 0.0250f, 0.1180f},
+    {0.1008f, 0.0755f, -0.0421f},
+    {-0.0515f, 0.0201f, 0.0011f},
+    {0.0428f, -0.0012f, -0.0036f},
+    {0.0817f, 0.0765f, 0.0749f},
+    {-0.1264f, -0.0522f, -0.1103f},
+    {-0.0280f, -0.0881f, -0.0499f},
+    {-0.1262f, -0.0982f, -0.0778f}};
+
+// https://github.com/Stability-AI/sd3.5/blob/main/sd3_impls.py#L228-L246
+const float sd3_latent_rgb_proj[16][3] = {  // 16 rows x 3 columns; same row layout preview_latent_image() indexes as latent_rgb_proj[d][0..2]
+    {-0.0645f, 0.0177f, 0.1052f},
+    {0.0028f, 0.0312f, 0.0650f},
+    {0.1848f, 0.0762f, 0.0360f},
+    {0.0944f, 0.0360f, 0.0889f},
+    {0.0897f, 0.0506f, -0.0364f},
+    {-0.0020f, 0.1203f, 0.0284f},
+    {0.0855f, 0.0118f, 0.0283f},
+    {-0.0539f, 0.0658f, 0.1047f},
+    {-0.0057f, 0.0116f, 0.0700f},
+    {-0.0412f, 0.0281f, -0.0039f},
+    {0.1106f, 0.1171f, 0.1220f},
+    {-0.0248f, 0.0682f, -0.0481f},
+    {0.0815f, 0.0846f, 0.1207f},
+    {-0.0120f, -0.0055f, -0.0867f},
+    {-0.0749f, -0.0634f, -0.0456f},
+    {-0.1418f, -0.1457f, -0.1259f},
+};
+
+// https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L32-L38
+const float sdxl_latent_rgb_proj[4][3] = {  // 4 rows x 3 columns; same row layout preview_latent_image() indexes as latent_rgb_proj[d][0..2]
+    {0.3651f, 0.4232f, 0.4341f},
+    {-0.2533f, -0.0042f, 0.1068f},
+    {0.1076f, 0.1111f, -0.0362f},
+    {-0.3165f, -0.2492f, -0.2188f}};
+
+// https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L32-L38
+const float sd_latent_rgb_proj[4][3] = {  // 4 rows x 3 columns; same row layout preview_latent_image() indexes as latent_rgb_proj[d][0..2]
+    {0.3512f, 0.2297f, 0.3227f},
+    {0.3250f, 0.4974f, 0.2350f},
+    {-0.2829f, 0.1762f, 0.2721f},
+    {-0.2120f, -0.2616f, -0.7177f}};
+
+// Fills `buffer` with width*height RGB pixels (3 bytes each, j outer / i inner): for pixel (i, j) the `dim`
+// floats at byte offset i*nb[0] + j*nb[1] + d*nb[2] of latents->data are weighted by latent_rgb_proj[d][0..2].
+// Marked `inline` because it is defined in a header; NOTE(review): assumes latents holds 32-bit floats - confirm.
+inline void preview_latent_image(uint8_t* buffer, struct ggml_tensor* latents, const float (*latent_rgb_proj)[3], int width, int height, int dim) {
+    size_t buffer_head = 0;
+    for (int j = 0; j < height; j++) {
+        for (int i = 0; i < width; i++) {
+            size_t latent_id = (i * latents->nb[0] + j * latents->nb[1]);
+            float r = 0, g = 0, b = 0;
+            for (int d = 0; d < dim; d++) {
+                float value = *(float*)((char*)latents->data + latent_id + d * latents->nb[2]);
+                r += value * latent_rgb_proj[d][0];
+                g += value * latent_rgb_proj[d][1];
+                b += value * latent_rgb_proj[d][2];
+            }
+            // change range: x -> x/2 + 1/2 (maps [-1,1] onto [0,1])
+            r = r * .5f + .5f;
+            g = g * .5f + .5f;
+            b = b * .5f + .5f;
+            // clamp rgb values to [0,1] range
+            r = r >= 0 ? r <= 1 ? r : 1 : 0;
+            g = g >= 0 ? g <= 1 ? g : 1 : 0;
+            b = b >= 0 ? b <= 1 ? b : 1 : 0;
+            buffer[buffer_head++] = (uint8_t)(r * 255);  // float -> byte conversion truncates
+            buffer[buffer_head++] = (uint8_t)(g * 255);
+            buffer[buffer_head++] = (uint8_t)(b * 255);
+        }
+    }
+}
-Original file line number
+Diff line change
@@ Expand Up / @@ -10,4 +10,5 @@ test/ @@
     *.gguf
     output*.png
     models*
-    *.log
+    *.log
+    preview.png