diff --git a/src/common/guided_filter.c b/src/common/guided_filter.c index fbb95c1b5758..855ec7512bc5 100644 --- a/src/common/guided_filter.c +++ b/src/common/guided_filter.c @@ -484,10 +484,8 @@ static int _guided_filter_cl_impl(int devid, const gboolean tiling = num_tiles > 1; // When should we avoid internal tiling and thus use CPU fallback code? - // Lets use advantage hint if provided or assume OpenCL is 10 times faster - const float hint = darktable.opencl->dev[devid].advantage; - const float advantage = hint > 1.0f ? 1.0f / hint : 0.1f; - const gboolean possible = ((float)valid_rows / (float)tile_height) > advantage; + // Lets assume OpenCL is 10 times faster + const gboolean possible = ((float)valid_rows / (float)tile_height) > 0.1f; if(tiling || (darktable.unmuted & DT_DEBUG_VERBOSE)) dt_print(DT_DEBUG_PIPE | DT_DEBUG_TILING, diff --git a/src/common/opencl.c b/src/common/opencl.c index 713ae0bc687c..ec53967bcda4 100644 --- a/src/common/opencl.c +++ b/src/common/opencl.c @@ -359,8 +359,8 @@ static void _opencl_write_device_config(const int devid) gchar key[256] = { 0 }; gchar dat[512] = { 0 }; - g_snprintf(key, 254, "%s%s", DT_CLDEVICE_HEAD, cl->dev[devid].cname); - g_snprintf(dat, 510, "%i %i %i %i %i %.3f %.3f", + g_snprintf(key, sizeof(key), "%s%s", DT_CLDEVICE_HEAD, cl->dev[devid].cname); + g_snprintf(dat, sizeof(dat), "%i %i %i %i %i %.3f %.3f", cl->dev[devid].micro_nap, cl->dev[devid].pinned_memory, @@ -368,17 +368,24 @@ static void _opencl_write_device_config(const int devid) cl->dev[devid].use_events ? 
1 : 0, cl->dev[devid].asyncmode, cl->dev[devid].disabled, - cl->dev[devid].advantage, + 0.0f, cl->dev[devid].unified_fraction); dt_print_nts(DT_DEBUG_OPENCL | DT_DEBUG_VERBOSE, - "\n[opencl_write_device_config] writing data '%s' for '%s'\n", dat, key); + "[opencl_write_device_config] writing data '%s' for '%s'\n", dat, key); + dt_conf_set_string(key, dat); + + // write per device list of modules that should not use OpenCL + g_snprintf(key, sizeof(key), "%s%s_nocl", DT_CLDEVICE_HEAD, cl->dev[devid].cname); + g_snprintf(dat, sizeof(dat), "%s", cl->dev[devid].avoid ? cl->dev[devid].avoid : ""); + dt_print_nts(DT_DEBUG_OPENCL | DT_DEBUG_VERBOSE, + "[opencl_write_device_config] writing data '%s' for '%s'\n", dat, key); dt_conf_set_string(key, dat); // Also take care of extended device data, these are not only device // specific but also depend on the devid to support systems with two // similar cards. - g_snprintf(key, 254, "%s%s_id%i", DT_CLDEVICE_HEAD, cl->dev[devid].cname, devid); - g_snprintf(dat, 510, "%i", cl->dev[devid].headroom); + g_snprintf(key, sizeof(key), "%s%s_id%i", DT_CLDEVICE_HEAD, cl->dev[devid].cname, devid); + g_snprintf(dat, sizeof(dat), "%i", cl->dev[devid].headroom); dt_print_nts(DT_DEBUG_OPENCL | DT_DEBUG_VERBOSE, "[opencl_write_device_config] writing data '%s' for '%s'\n", dat, key); dt_conf_set_string(key, dat); @@ -413,7 +420,7 @@ static gboolean _opencl_read_device_config(const int devid) dt_opencl_t *cl = darktable.opencl; dt_opencl_device_t *cldid = &cl->dev[devid]; gchar key[256] = { 0 }; - g_snprintf(key, 254, "%s%s", DT_CLDEVICE_HEAD, cl->dev[devid].cname); + g_snprintf(key, sizeof(key), "%s%s", DT_CLDEVICE_HEAD, cl->dev[devid].cname); const gboolean existing_device = dt_conf_key_not_empty(key); gboolean safety_ok = TRUE; @@ -425,17 +432,16 @@ static gboolean _opencl_read_device_config(const int devid) int events; int asyncmode; int disabled; - float advantage; + float dummy; float unified_fraction; sscanf(dat, "%i %i %i %i %i %f %f", - 
&micro_nap, &pinned_memory, &events, &asyncmode, &disabled, &advantage, &unified_fraction); + &micro_nap, &pinned_memory, &events, &asyncmode, &disabled, &dummy, &unified_fraction); cldid->use_events = events ? TRUE : FALSE; cldid->micro_nap = micro_nap; cldid->pinned_memory = pinned_memory ? TRUE : FALSE; cldid->asyncmode = asyncmode ? TRUE : FALSE; cldid->disabled = disabled ? TRUE : FALSE; - cldid->advantage = advantage; cldid->unified_fraction = unified_fraction; } @@ -444,12 +450,14 @@ static gboolean _opencl_read_device_config(const int devid) cldid->unified_fraction = 0.25f; if((cldid->micro_nap < 0) || (cldid->micro_nap > 1000000)) cldid->micro_nap = 250; - if((cldid->advantage < 0.0f) || (cldid->advantage > 10000.0f)) - cldid->advantage = 0.0f; + + // Also read the per-device list of modules to be avoided for OpenCL + g_snprintf(key, sizeof(key), "%s%s_nocl", DT_CLDEVICE_HEAD, cl->dev[devid].cname); + cldid->avoid = dt_conf_key_not_empty(key) ? dt_conf_get_string(key) : NULL; // Also take care of extended device data, these are not only device // specific but also depend on the devid - g_snprintf(key, 254, "%s%s_id%i", DT_CLDEVICE_HEAD, cldid->cname, devid); + g_snprintf(key, sizeof(key), "%s%s_id%i", DT_CLDEVICE_HEAD, cldid->cname, devid); if(dt_conf_key_not_empty(key)) { const gchar *dat = dt_conf_get_string_const(key); @@ -516,6 +524,7 @@ static gboolean _opencl_device_init(dt_opencl_t *cl, cl->dev[dev].cname = NULL; cl->dev[dev].options = NULL; cl->dev[dev].cflags = NULL; + cl->dev[dev].avoid = NULL; cl->dev[dev].memory_in_use = 0; cl->dev[dev].peak_memory = 0; cl->dev[dev].used_available = 0; @@ -528,7 +537,6 @@ static gboolean _opencl_device_init(dt_opencl_t *cl, cl->dev[dev].clmem_error = FALSE; cl->dev[dev].clroundup_wd = 16; cl->dev[dev].clroundup_ht = 16; - cl->dev[dev].advantage = 0.0f; cl->dev[dev].use_events = TRUE; cl->dev[dev].asyncmode = FALSE; cl->dev[dev].disabled = FALSE; @@ -785,13 +793,16 @@ static gboolean _opencl_device_init(dt_opencl_t *cl, 
dt_print_nts(DT_DEBUG_OPENCL, " DEVICE VERSION: %s API=%s\n", cl->dev[dev].device_version, cl->api30 ? "300" : "120"); - dt_print_nts(DT_DEBUG_OPENCL, " DEVICE_TYPE: %s%s%s%s%s\n", + dt_print_nts(DT_DEBUG_OPENCL, " DEVICE_TYPE: %s%s%s%s%s", ((type & CL_DEVICE_TYPE_CPU) == CL_DEVICE_TYPE_CPU) ? "CPU" : "", ((type & CL_DEVICE_TYPE_GPU) == CL_DEVICE_TYPE_GPU) ? "GPU" : "", ((type & CL_DEVICE_TYPE_CUSTOM) == CL_DEVICE_TYPE_CUSTOM) ? "CUSTOM" : "", (type & CL_DEVICE_TYPE_ACCELERATOR) ? ", Accelerator" : "", unified_memory ? ", unified mem" : ", dedicated mem" ); + if(unified_memory) dt_print_nts(DT_DEBUG_OPENCL, " (%i%%)\n", (int)(100.f * cl->dev[dev].unified_fraction)); + else dt_print_nts(DT_DEBUG_OPENCL, "\n"); + if(is_custom_device && newdevice) { dt_print_nts(DT_DEBUG_OPENCL, @@ -853,8 +864,8 @@ static gboolean _opencl_device_init(dt_opencl_t *cl, if(cl->dev[dev].max_global_mem < (uint64_t)800ul * DT_MEGA) { dt_print_nts(DT_DEBUG_OPENCL, - " *** insufficient global memory (%" PRIu64 "MB) ***\n", - cl->dev[dev].max_global_mem / DT_MEGA); + " *** insufficient global memory %zu MB) ***\n", + (size_t)cl->dev[dev].max_global_mem / DT_MEGA); res = TRUE; cl->dev[dev].disabled |= TRUE; goto end; @@ -875,18 +886,15 @@ static gboolean _opencl_device_init(dt_opencl_t *cl, } dt_print_nts(DT_DEBUG_OPENCL, - " GLOBAL MEM SIZE: %.0f MB\n", - (double)cl->dev[dev].max_global_mem / (double)DT_MEGA); + " GLOBAL MEM SIZE: %zu MB\n", (size_t)(cl->dev[dev].max_global_mem / DT_MEGA)); dt_print_nts(DT_DEBUG_OPENCL, - " MAX IMAGE ALLOC: %.0f MB\n", - (double)cl->dev[dev].max_mem_alloc / (double)DT_MEGA); + " MAX IMAGE ALLOC: %zu MB\n", (size_t)(cl->dev[dev].max_mem_alloc / DT_MEGA)); dt_print_nts(DT_DEBUG_OPENCL, - " MAX IMAGE SIZE: %zd x %zd\n", - cl->dev[dev].max_image_width, cl->dev[dev].max_image_height); + " MAX IMAGE SIZE: %zu x %zu\n", cl->dev[dev].max_image_width, cl->dev[dev].max_image_height); dt_print_nts(DT_DEBUG_OPENCL, - " MAX CONSTANT BUFFER: %.0f KB\n", 
(double)cl->dev[dev].max_mem_constant / 1024.0); + " MAX CONSTANT BUFFER: %zu KB\n", (size_t)(cl->dev[dev].max_mem_constant / 1024)); dt_print_nts(DT_DEBUG_OPENCL, - " LOCAL MEM SIZE: %zu KB\n", cl->dev[dev].local_size / 1024lu); + " LOCAL MEM SIZE: %zu KB\n", (size_t)(cl->dev[dev].local_size / 1024)); dt_print_nts(DT_DEBUG_OPENCL, " ADDRESS ALIGN: %d B\n", cl->dev[dev].alignsize / 8); dt_print_nts(DT_DEBUG_OPENCL, @@ -961,10 +969,10 @@ static gboolean _opencl_device_init(dt_opencl_t *cl, " EVENTS HANDLED: %s\n", STR_YESNO(cl->dev[dev].use_events)); dt_print_nts(DT_DEBUG_OPENCL, " OPENCL FAST MODE: %s\n", STR_YESNO(fastopencl)); - dt_print_nts(DT_DEBUG_OPENCL, - " TILING ADVANTAGE: %.3f\n", cl->dev[dev].advantage); dt_print_nts(DT_DEBUG_OPENCL, " DEFAULT DEVICE: %s\n", STR_YESNO(type & CL_DEVICE_TYPE_DEFAULT)); + dt_print_nts(DT_DEBUG_OPENCL, + " AVOIDED MODULES: %s\n", cl->dev[dev].avoid ? cl->dev[dev].avoid : "none"); if(cl->dev[dev].disabled) { @@ -1221,6 +1229,7 @@ static void _cleanup_cl_device_mem(dt_opencl_t *cl, const int i) free((void *)(cl->dev[i].cname)); free((void *)(cl->dev[i].options)); free((void *)(cl->dev[i].cflags)); + g_free((void *)(cl->dev[i].avoid)); } void dt_opencl_init(dt_opencl_t *cl, @@ -1607,7 +1616,7 @@ void dt_opencl_init(dt_opencl_t *cl, dt_opencl_scheduling_profile_t profile = _opencl_get_scheduling_profile(); _opencl_apply_scheduling_profile(profile); - // let's keep track on unified memory devices + // let's report unified memory per device dt_sys_resources_t *res = &darktable.dtresources; for(int i = 0; i < cl->num_devs; i++) { @@ -3598,9 +3607,9 @@ void dt_opencl_memory_statistics(int devid, { dt_print(DT_DEBUG_OPENCL,"[opencl memory] device '%s' id=%d: %.1fMB in use, %.1fMB available GPU mem of %.1fMB", cl->dev[devid].fullname, devid, - (float)cl->dev[devid].memory_in_use/(1024*1024), - (float)cl->dev[devid].used_available/(1024*1024), - (float)cl->dev[devid].max_global_mem/(1024*1024)); + (float)cl->dev[devid].memory_in_use / 
DT_MEGA, + (float)cl->dev[devid].used_available / DT_MEGA, + (float)cl->dev[devid].max_global_mem / DT_MEGA); if(cl->dev[devid].memory_in_use > darktable.opencl->dev[devid].used_available) { dt_print(DT_DEBUG_OPENCL, diff --git a/src/common/opencl.h b/src/common/opencl.h index 34a188d8abaa..c4cb33d40720 100644 --- a/src/common/opencl.h +++ b/src/common/opencl.h @@ -151,6 +151,7 @@ typedef struct dt_opencl_device_t const char *cname; const char *options; const char *cflags; + const char *avoid; cl_int summary; size_t memory_in_use; size_t peak_memory; @@ -211,8 +212,6 @@ typedef struct dt_opencl_device_t // lets keep the vendor for runtime checks int vendor_id; - - float advantage; } dt_opencl_device_t; struct dt_bilateral_cl_global_t; diff --git a/src/develop/pixelpipe_hb.c b/src/develop/pixelpipe_hb.c index 599b0a4fc84f..0675bc4472e5 100644 --- a/src/develop/pixelpipe_hb.c +++ b/src/develop/pixelpipe_hb.c @@ -1694,6 +1694,13 @@ static void _opencl_dump_diff_pipe_pfm(dt_dev_pixelpipe_t *pipe, dt_free_align(clin); } } + +static inline gboolean _avoid_cl_module(const dt_dev_pixelpipe_iop_t *piece) +{ + const dt_opencl_device_t *cldid = &darktable.opencl->dev[piece->pipe->devid]; + return cldid->avoid && dt_str_commasubstring(cldid->avoid, piece->module->op); +} + #endif static inline gboolean _skip_piece_on_tags(const dt_dev_pixelpipe_iop_t *piece) @@ -2140,7 +2147,8 @@ static gboolean _dev_pixelpipe_process_rec(dt_dev_pixelpipe_t *pipe, gboolean possible_cl = module->process_cl && piece->process_cl_ready - && !(dt_pipe_is_preview(pipe) && (module->flags() & IOP_FLAGS_PREVIEW_NON_OPENCL)); + && !(dt_pipe_is_preview(pipe) && (module->flags() & IOP_FLAGS_PREVIEW_NON_OPENCL)) + && !_avoid_cl_module(piece); const uint32_t m_bpp = MAX(in_bpp, bpp); const size_t m_width = MAX(roi_in.width, roi_out->width); @@ -2154,24 +2162,6 @@ static gboolean _dev_pixelpipe_process_rec(dt_dev_pixelpipe_t *pipe, { if(!_piece_may_tile(piece)) possible_cl = FALSE; - - const float advantage 
= darktable.opencl->dev[pipe->devid].advantage; - if(possible_cl && (advantage > 0.0f)) - { - const float tilemem_cl = dt_tiling_estimate_clmem(&tiling, piece, - &roi_in, roi_out, m_bpp); - const float tilemem_cpu = dt_tiling_estimate_cpumem(&tiling, piece, - &roi_in, roi_out, m_bpp); - if((tilemem_cpu * advantage) < tilemem_cl) - { - dt_print(DT_DEBUG_OPENCL | DT_DEBUG_TILING, - "[dt_dev_pixelpipetiling_cl] [%s] estimates cpu" - " advantage in `%s', (dev=%i, adv=%.2f, GPU %.2f CPU %.2f)", - dt_dev_pixelpipe_type_to_str(pipe->type), module->op, pipe->devid, - advantage, tilemem_cl / 1e9, tilemem_cpu / 1e9); - possible_cl = FALSE; - } - } } if(possible_cl) diff --git a/src/develop/tiling.c b/src/develop/tiling.c index 8a5f9d9f82e6..af24e6311d2b 100644 --- a/src/develop/tiling.c +++ b/src/develop/tiling.c @@ -1181,129 +1181,7 @@ void default_process_tiling(dt_iop_module_t *self, return; } -float dt_tiling_estimate_cpumem(const dt_develop_tiling_t *tiling, - const dt_dev_pixelpipe_iop_t *piece, - const dt_iop_roi_t *const roi_in, - const dt_iop_roi_t *const roi_out, - const int max_bpp) -{ - const int m_dx = MAX(roi_in->width, roi_out->width); - const int m_dy = MAX(roi_in->height, roi_out->height); - if(dt_tiling_piece_fits_host_memory(piece, m_dx, m_dy, max_bpp, tiling->factor, tiling->overhead)) - return (float)m_dx * m_dy * max_bpp * tiling->factor + tiling->overhead; - - const float fullscale = fmaxf(roi_in->scale / roi_out->scale, sqrtf(((float)roi_in->width * roi_in->height) - / ((float)roi_out->width * roi_out->height))); - float available = dt_get_available_pipe_mem(piece->pipe); - available = fmaxf(available - ((float)roi_out->width * roi_out->height * max_bpp) - - ((float)roi_in->width * roi_in->height * max_bpp) - tiling->overhead, 0.0f); - - float singlebuffer = dt_get_singlebuffer_mem(); - const float factor = fmaxf(tiling->factor, 1.0f); - const float maxbuf = fmaxf(tiling->maxbuf, 1.0f); - singlebuffer = fmaxf(available / factor, singlebuffer); - - int 
width = MAX(roi_in->width, roi_out->width); - int height = MAX(roi_in->height, roi_out->height); - - const unsigned int align = tiling->align; - if((float)width * height * max_bpp * maxbuf > singlebuffer) - { - const float scale = singlebuffer / ((float)width * height * max_bpp * maxbuf); - if(width < height && scale >= 0.333f) - height = _align_down((int)floorf(height * scale), align); - else if(height <= width && scale >= 0.333f) - width = _align_down((int)floorf(width * scale), align); - else - { - width = _align_down((int)floorf(width * sqrtf(scale)), align); - height = _align_down((int)floorf(height * sqrtf(scale)), align); - } - } - - if(3 * tiling->overlap > width || 3 * tiling->overlap > height) - width = height = _align_down((int)floorf(sqrtf((float)width * height)), align); - const int overlap_in = _align_up(tiling->overlap, align); - const int overlap_out = ceilf((float)overlap_in / fullscale); - - int tiles_x = 1, tiles_y = 1; - - if(roi_in->width > roi_out->width) - tiles_x = (width < roi_in->width) ? ceilf((float)roi_in->width / (float)MAX(width - 2 * overlap_in, 1)) : 1; - else - tiles_x = (width < roi_out->width) ? ceilf((float)roi_out->width / (float)MAX(width - 2 * overlap_out, 1)) : 1; - - if(roi_in->height > roi_out->height) - tiles_y = (height < roi_in->height) ? ceilf((float)roi_in->height / (float)MAX(height - 2 * overlap_in, 1)) : 1; - else - tiles_y = (height < roi_out->height) ? 
ceilf((float)roi_out->height / (float)MAX(height - 2 * overlap_out, 1)) : 1; - dt_print(DT_DEBUG_TILING, "tilex = %i, tiley = %i", tiles_x, tiles_y); - return (float)tiles_x * tiles_y * singlebuffer ; -} - #ifdef HAVE_OPENCL -float dt_tiling_estimate_clmem(const dt_develop_tiling_t *tiling, - const dt_dev_pixelpipe_iop_t *piece, - const dt_iop_roi_t *const roi_in, - const dt_iop_roi_t *const roi_out, - const int max_bpp) -{ - const int devid = piece->pipe->devid; - const float fullscale = fmaxf(roi_in->scale / roi_out->scale, sqrtf(((float)roi_in->width * roi_in->height) - / ((float)roi_out->width * roi_out->height))); - const gboolean use_pinned_memory = dt_opencl_use_pinned_memory(devid); - /* If using pinned transfer on devices with dedicated GPU mem there is an additional - mem pressure as they will allocate also on device as cache for performance - */ - const float pinned_buffer_overhead = use_pinned_memory && !dt_opencl_unified_memory(devid) ? 2.0f : 0.0f; - const float pinned_buffer_slack = use_pinned_memory ? 
0.85f : 1.0f; - const float available = (float)dt_opencl_get_device_available(devid); - const float factor = fmaxf(tiling->factor_cl + pinned_buffer_overhead, 1.0f); - const float singlebuffer = fminf(fmaxf((available - tiling->overhead) / factor, 0.0f), - pinned_buffer_slack * (float)(dt_opencl_get_device_memalloc(devid))); - const float maxbuf = fmaxf(tiling->maxbuf_cl, 1.0f); - - int width = MIN(MAX(roi_in->width, roi_out->width), darktable.opencl->dev[devid].max_image_width); - int height = MIN(MAX(roi_in->height, roi_out->height), darktable.opencl->dev[devid].max_image_height); - - const unsigned int align = _lcm(tiling->align, dt_opencl_tiling_align(devid)); - - if((float)width * height * max_bpp * maxbuf > singlebuffer) - { - const float scale = singlebuffer / ((float)width * height * max_bpp * maxbuf); - - if(width < height && scale >= 0.333f) - height = _align_down((int)floorf(height * scale), align); - else if(height <= width && scale >= 0.333f) - width = _align_down((int)floorf(width * scale), align); - else - { - width = _align_down((int)floorf(width * sqrtf(scale)), align); - height = _align_down((int)floorf(height * sqrtf(scale)), align); - } - } - - if(3 * tiling->overlap > width || 3 * tiling->overlap > height) - width = height = _align_down((int)floorf(sqrtf((float)width * height)), align); - - const int overlap_in = _align_up(tiling->overlap, align); - const int overlap_out = ceilf((float)overlap_in / fullscale); - - int tiles_x = 1, tiles_y = 1; - - if(roi_in->width > roi_out->width) - tiles_x = (width < roi_in->width) ? ceilf((float)roi_in->width / (float)MAX(width - 2 * overlap_in, 1)) : 1; - else - tiles_x = (width < roi_out->width) ? ceilf((float)roi_out->width / (float)MAX(width - 2 * overlap_out, 1)) : 1; - - if(roi_in->height > roi_out->height) - tiles_y = (height < roi_in->height) ? ceilf((float)roi_in->height / (float)MAX(height - 2 * overlap_in, 1)) : 1; - else - tiles_y = (height < roi_out->height) ? 
ceilf((float)roi_out->height / (float)MAX(height - 2 * overlap_out, 1)) : 1; - - return (float)tiles_x * tiles_y * singlebuffer * factor; -} - /* simple tiling algorithm for roi_in == roi_out, i.e. for pixel to pixel modules/operations */ static int _default_process_tiling_cl_ptp(dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, diff --git a/src/develop/tiling.h b/src/develop/tiling.h index 8e5100f3fe2b..774e229b7ede 100644 --- a/src/develop/tiling.h +++ b/src/develop/tiling.h @@ -71,15 +71,6 @@ void tiling_callback(struct dt_iop_module_t *self, struct dt_dev_pixelpipe_iop_t gboolean dt_tiling_piece_fits_host_memory(const struct dt_dev_pixelpipe_iop_t *piece, const size_t width, const size_t height, const unsigned bpp, const float factor, const size_t overhead); -float dt_tiling_estimate_cpumem(const dt_develop_tiling_t *tiling, const struct dt_dev_pixelpipe_iop_t *piece, - const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out, - const int max_bpp); - -#ifdef HAVE_OPENCL -float dt_tiling_estimate_clmem(const dt_develop_tiling_t *tiling, const struct dt_dev_pixelpipe_iop_t *piece, - const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out, - const int max_bpp); -#endif // clang-format off // modelines: These editor modelines have been set for all relevant files by tools/update_modelines.py // vim: shiftwidth=2 expandtab tabstop=2 cindent