Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 2 additions & 4 deletions src/common/guided_filter.c
Original file line number Diff line number Diff line change
Expand Up @@ -484,10 +484,8 @@ static int _guided_filter_cl_impl(int devid,
const gboolean tiling = num_tiles > 1;

// When should we avoid internal tiling and thus use CPU fallback code?
// Lets use advantage hint if provided or assume OpenCL is 10 times faster
const float hint = darktable.opencl->dev[devid].advantage;
const float advantage = hint > 1.0f ? 1.0f / hint : 0.1f;
const gboolean possible = ((float)valid_rows / (float)tile_height) > advantage;
// Let's assume OpenCL is 10 times faster
const gboolean possible = ((float)valid_rows / (float)tile_height) > 0.1f;

if(tiling || (darktable.unmuted & DT_DEBUG_VERBOSE))
dt_print(DT_DEBUG_PIPE | DT_DEBUG_TILING,
Expand Down
71 changes: 40 additions & 31 deletions src/common/opencl.c
Original file line number Diff line number Diff line change
Expand Up @@ -359,26 +359,33 @@ static void _opencl_write_device_config(const int devid)

gchar key[256] = { 0 };
gchar dat[512] = { 0 };
g_snprintf(key, 254, "%s%s", DT_CLDEVICE_HEAD, cl->dev[devid].cname);
g_snprintf(dat, 510, "%i %i %i %i %i %.3f %.3f",
g_snprintf(key, sizeof(key), "%s%s", DT_CLDEVICE_HEAD, cl->dev[devid].cname);
g_snprintf(dat, sizeof(dat), "%i %i %i %i %i %.3f %.3f",
cl->dev[devid].micro_nap,
cl->dev[devid].pinned_memory,

// this used to define the number of slots, now a bool and using DT_OPENCL_EVENTS if true
cl->dev[devid].use_events ? 1 : 0,
cl->dev[devid].asyncmode,
cl->dev[devid].disabled,
cl->dev[devid].advantage,
0.0f,
cl->dev[devid].unified_fraction);
dt_print_nts(DT_DEBUG_OPENCL | DT_DEBUG_VERBOSE,
"\n[opencl_write_device_config] writing data '%s' for '%s'\n", dat, key);
"[opencl_write_device_config] writing data '%s' for '%s'\n", dat, key);
dt_conf_set_string(key, dat);

// write per device list of modules that should not use OpenCL
g_snprintf(key, sizeof(key), "%s%s_nocl", DT_CLDEVICE_HEAD, cl->dev[devid].cname);
g_snprintf(dat, sizeof(dat), "%s", cl->dev[devid].avoid ? cl->dev[devid].avoid : "");
dt_print_nts(DT_DEBUG_OPENCL | DT_DEBUG_VERBOSE,
"[opencl_write_device_config] writing data '%s' for '%s'\n", dat, key);
dt_conf_set_string(key, dat);

// Also take care of extended device data, these are not only device
// specific but also depend on the devid to support systems with two
// similar cards.
g_snprintf(key, 254, "%s%s_id%i", DT_CLDEVICE_HEAD, cl->dev[devid].cname, devid);
g_snprintf(dat, 510, "%i", cl->dev[devid].headroom);
g_snprintf(key, sizeof(key), "%s%s_id%i", DT_CLDEVICE_HEAD, cl->dev[devid].cname, devid);
g_snprintf(dat, sizeof(dat), "%i", cl->dev[devid].headroom);
dt_print_nts(DT_DEBUG_OPENCL | DT_DEBUG_VERBOSE,
"[opencl_write_device_config] writing data '%s' for '%s'\n", dat, key);
dt_conf_set_string(key, dat);
Expand Down Expand Up @@ -413,7 +420,7 @@ static gboolean _opencl_read_device_config(const int devid)
dt_opencl_t *cl = darktable.opencl;
dt_opencl_device_t *cldid = &cl->dev[devid];
gchar key[256] = { 0 };
g_snprintf(key, 254, "%s%s", DT_CLDEVICE_HEAD, cl->dev[devid].cname);
g_snprintf(key, sizeof(key), "%s%s", DT_CLDEVICE_HEAD, cl->dev[devid].cname);

const gboolean existing_device = dt_conf_key_not_empty(key);
gboolean safety_ok = TRUE;
Expand All @@ -425,17 +432,16 @@ static gboolean _opencl_read_device_config(const int devid)
int events;
int asyncmode;
int disabled;
float advantage;
float dummy;
float unified_fraction;
sscanf(dat, "%i %i %i %i %i %f %f",
&micro_nap, &pinned_memory, &events, &asyncmode, &disabled, &advantage, &unified_fraction);
&micro_nap, &pinned_memory, &events, &asyncmode, &disabled, &dummy, &unified_fraction);

cldid->use_events = events ? TRUE : FALSE;
cldid->micro_nap = micro_nap;
cldid->pinned_memory = pinned_memory ? TRUE : FALSE;
cldid->asyncmode = asyncmode ? TRUE : FALSE;
cldid->disabled = disabled ? TRUE : FALSE;
cldid->advantage = advantage;
cldid->unified_fraction = unified_fraction;
}

Expand All @@ -444,12 +450,14 @@ static gboolean _opencl_read_device_config(const int devid)
cldid->unified_fraction = 0.25f;
if((cldid->micro_nap < 0) || (cldid->micro_nap > 1000000))
cldid->micro_nap = 250;
if((cldid->advantage < 0.0f) || (cldid->advantage > 10000.0f))
cldid->advantage = 0.0f;

// Also read the per-device list of modules to be avoided for OpenCL
g_snprintf(key, sizeof(key), "%s%s_nocl", DT_CLDEVICE_HEAD, cl->dev[devid].cname);
cldid->avoid = dt_conf_key_not_empty(key) ? dt_conf_get_string(key) : NULL;

// Also take care of extended device data, these are not only device
// specific but also depend on the devid
g_snprintf(key, 254, "%s%s_id%i", DT_CLDEVICE_HEAD, cldid->cname, devid);
g_snprintf(key, sizeof(key), "%s%s_id%i", DT_CLDEVICE_HEAD, cldid->cname, devid);
if(dt_conf_key_not_empty(key))
{
const gchar *dat = dt_conf_get_string_const(key);
Expand Down Expand Up @@ -516,6 +524,7 @@ static gboolean _opencl_device_init(dt_opencl_t *cl,
cl->dev[dev].cname = NULL;
cl->dev[dev].options = NULL;
cl->dev[dev].cflags = NULL;
cl->dev[dev].avoid = NULL;
cl->dev[dev].memory_in_use = 0;
cl->dev[dev].peak_memory = 0;
cl->dev[dev].used_available = 0;
Expand All @@ -528,7 +537,6 @@ static gboolean _opencl_device_init(dt_opencl_t *cl,
cl->dev[dev].clmem_error = FALSE;
cl->dev[dev].clroundup_wd = 16;
cl->dev[dev].clroundup_ht = 16;
cl->dev[dev].advantage = 0.0f;
cl->dev[dev].use_events = TRUE;
cl->dev[dev].asyncmode = FALSE;
cl->dev[dev].disabled = FALSE;
Expand Down Expand Up @@ -785,13 +793,16 @@ static gboolean _opencl_device_init(dt_opencl_t *cl,
dt_print_nts(DT_DEBUG_OPENCL, " DEVICE VERSION: %s API=%s\n",
cl->dev[dev].device_version,
cl->api30 ? "300" : "120");
dt_print_nts(DT_DEBUG_OPENCL, " DEVICE_TYPE: %s%s%s%s%s\n",
dt_print_nts(DT_DEBUG_OPENCL, " DEVICE_TYPE: %s%s%s%s%s",
((type & CL_DEVICE_TYPE_CPU) == CL_DEVICE_TYPE_CPU) ? "CPU" : "",
((type & CL_DEVICE_TYPE_GPU) == CL_DEVICE_TYPE_GPU) ? "GPU" : "",
((type & CL_DEVICE_TYPE_CUSTOM) == CL_DEVICE_TYPE_CUSTOM) ? "CUSTOM" : "",
(type & CL_DEVICE_TYPE_ACCELERATOR) ? ", Accelerator" : "",
unified_memory ? ", unified mem" : ", dedicated mem" );

if(unified_memory) dt_print_nts(DT_DEBUG_OPENCL, " (%i%%)\n", (int)(100.f * cl->dev[dev].unified_fraction));
else dt_print_nts(DT_DEBUG_OPENCL, "\n");

if(is_custom_device && newdevice)
{
dt_print_nts(DT_DEBUG_OPENCL,
Expand Down Expand Up @@ -853,8 +864,8 @@ static gboolean _opencl_device_init(dt_opencl_t *cl,
if(cl->dev[dev].max_global_mem < (uint64_t)800ul * DT_MEGA)
{
dt_print_nts(DT_DEBUG_OPENCL,
" *** insufficient global memory (%" PRIu64 "MB) ***\n",
cl->dev[dev].max_global_mem / DT_MEGA);
" *** insufficient global memory %zu MB) ***\n",
(size_t)cl->dev[dev].max_global_mem / DT_MEGA);
res = TRUE;
cl->dev[dev].disabled |= TRUE;
goto end;
Expand All @@ -875,18 +886,15 @@ static gboolean _opencl_device_init(dt_opencl_t *cl,
}

dt_print_nts(DT_DEBUG_OPENCL,
" GLOBAL MEM SIZE: %.0f MB\n",
(double)cl->dev[dev].max_global_mem / (double)DT_MEGA);
" GLOBAL MEM SIZE: %zu MB\n", (size_t)(cl->dev[dev].max_global_mem / DT_MEGA));
dt_print_nts(DT_DEBUG_OPENCL,
" MAX IMAGE ALLOC: %.0f MB\n",
(double)cl->dev[dev].max_mem_alloc / (double)DT_MEGA);
" MAX IMAGE ALLOC: %zu MB\n", (size_t)(cl->dev[dev].max_mem_alloc / DT_MEGA));
dt_print_nts(DT_DEBUG_OPENCL,
" MAX IMAGE SIZE: %zd x %zd\n",
cl->dev[dev].max_image_width, cl->dev[dev].max_image_height);
" MAX IMAGE SIZE: %zu x %zu\n", cl->dev[dev].max_image_width, cl->dev[dev].max_image_height);
dt_print_nts(DT_DEBUG_OPENCL,
" MAX CONSTANT BUFFER: %.0f KB\n", (double)cl->dev[dev].max_mem_constant / 1024.0);
" MAX CONSTANT BUFFER: %zu KB\n", (size_t)(cl->dev[dev].max_mem_constant / 1024));
dt_print_nts(DT_DEBUG_OPENCL,
" LOCAL MEM SIZE: %zu KB\n", cl->dev[dev].local_size / 1024lu);
" LOCAL MEM SIZE: %zu KB\n", (size_t)(cl->dev[dev].local_size / 1024));
dt_print_nts(DT_DEBUG_OPENCL,
" ADDRESS ALIGN: %d B\n", cl->dev[dev].alignsize / 8);
dt_print_nts(DT_DEBUG_OPENCL,
Expand Down Expand Up @@ -961,10 +969,10 @@ static gboolean _opencl_device_init(dt_opencl_t *cl,
" EVENTS HANDLED: %s\n", STR_YESNO(cl->dev[dev].use_events));
dt_print_nts(DT_DEBUG_OPENCL,
" OPENCL FAST MODE: %s\n", STR_YESNO(fastopencl));
dt_print_nts(DT_DEBUG_OPENCL,
" TILING ADVANTAGE: %.3f\n", cl->dev[dev].advantage);
dt_print_nts(DT_DEBUG_OPENCL,
" DEFAULT DEVICE: %s\n", STR_YESNO(type & CL_DEVICE_TYPE_DEFAULT));
dt_print_nts(DT_DEBUG_OPENCL,
" AVOIDED MODULES: %s\n", cl->dev[dev].avoid ? cl->dev[dev].avoid : "none");

if(cl->dev[dev].disabled)
{
Expand Down Expand Up @@ -1221,6 +1229,7 @@ static void _cleanup_cl_device_mem(dt_opencl_t *cl, const int i)
free((void *)(cl->dev[i].cname));
free((void *)(cl->dev[i].options));
free((void *)(cl->dev[i].cflags));
g_free((void *)(cl->dev[i].avoid));
}

void dt_opencl_init(dt_opencl_t *cl,
Expand Down Expand Up @@ -1607,7 +1616,7 @@ void dt_opencl_init(dt_opencl_t *cl,
dt_opencl_scheduling_profile_t profile = _opencl_get_scheduling_profile();
_opencl_apply_scheduling_profile(profile);

// let's keep track on unified memory devices
// let's report unified memory per device
dt_sys_resources_t *res = &darktable.dtresources;
for(int i = 0; i < cl->num_devs; i++)
{
Expand Down Expand Up @@ -3598,9 +3607,9 @@ void dt_opencl_memory_statistics(int devid,
{
dt_print(DT_DEBUG_OPENCL,"[opencl memory] device '%s' id=%d: %.1fMB in use, %.1fMB available GPU mem of %.1fMB",
cl->dev[devid].fullname, devid,
(float)cl->dev[devid].memory_in_use/(1024*1024),
(float)cl->dev[devid].used_available/(1024*1024),
(float)cl->dev[devid].max_global_mem/(1024*1024));
(float)cl->dev[devid].memory_in_use / DT_MEGA,
(float)cl->dev[devid].used_available / DT_MEGA,
(float)cl->dev[devid].max_global_mem / DT_MEGA);
if(cl->dev[devid].memory_in_use > darktable.opencl->dev[devid].used_available)
{
dt_print(DT_DEBUG_OPENCL,
Expand Down
3 changes: 1 addition & 2 deletions src/common/opencl.h
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,7 @@ typedef struct dt_opencl_device_t
const char *cname;
const char *options;
const char *cflags;
const char *avoid;
cl_int summary;
size_t memory_in_use;
size_t peak_memory;
Expand Down Expand Up @@ -211,8 +212,6 @@ typedef struct dt_opencl_device_t

// let's keep the vendor for runtime checks
int vendor_id;

float advantage;
} dt_opencl_device_t;

struct dt_bilateral_cl_global_t;
Expand Down
28 changes: 9 additions & 19 deletions src/develop/pixelpipe_hb.c
Original file line number Diff line number Diff line change
Expand Up @@ -1694,6 +1694,13 @@ static void _opencl_dump_diff_pipe_pfm(dt_dev_pixelpipe_t *pipe,
dt_free_align(clin);
}
}

static inline gboolean _avoid_cl_module(const dt_dev_pixelpipe_iop_t *piece)
{
const dt_opencl_device_t *cldid = &darktable.opencl->dev[piece->pipe->devid];
return cldid->avoid && dt_str_commasubstring(cldid->avoid, piece->module->op);
}

#endif

static inline gboolean _skip_piece_on_tags(const dt_dev_pixelpipe_iop_t *piece)
Expand Down Expand Up @@ -2140,7 +2147,8 @@ static gboolean _dev_pixelpipe_process_rec(dt_dev_pixelpipe_t *pipe,
gboolean possible_cl =
module->process_cl
&& piece->process_cl_ready
&& !(dt_pipe_is_preview(pipe) && (module->flags() & IOP_FLAGS_PREVIEW_NON_OPENCL));
&& !(dt_pipe_is_preview(pipe) && (module->flags() & IOP_FLAGS_PREVIEW_NON_OPENCL))
&& !_avoid_cl_module(piece);

const uint32_t m_bpp = MAX(in_bpp, bpp);
const size_t m_width = MAX(roi_in.width, roi_out->width);
Expand All @@ -2154,24 +2162,6 @@ static gboolean _dev_pixelpipe_process_rec(dt_dev_pixelpipe_t *pipe,
{
if(!_piece_may_tile(piece))
possible_cl = FALSE;

const float advantage = darktable.opencl->dev[pipe->devid].advantage;
if(possible_cl && (advantage > 0.0f))
{
const float tilemem_cl = dt_tiling_estimate_clmem(&tiling, piece,
&roi_in, roi_out, m_bpp);
const float tilemem_cpu = dt_tiling_estimate_cpumem(&tiling, piece,
&roi_in, roi_out, m_bpp);
if((tilemem_cpu * advantage) < tilemem_cl)
{
dt_print(DT_DEBUG_OPENCL | DT_DEBUG_TILING,
"[dt_dev_pixelpipetiling_cl] [%s] estimates cpu"
" advantage in `%s', (dev=%i, adv=%.2f, GPU %.2f CPU %.2f)",
dt_dev_pixelpipe_type_to_str(pipe->type), module->op, pipe->devid,
advantage, tilemem_cl / 1e9, tilemem_cpu / 1e9);
possible_cl = FALSE;
}
}
}

if(possible_cl)
Expand Down
Loading
Loading