Skip to content

Reduce binary size of cuml.explainer kernels #7001

@jcrist

Description

@jcrist

Most cuml.explainer kernels are parameterized by the model dtype (float32, float64), leading to 2 instances of each kernel. Some are also parameterized by the prediction type (float32, float64), leading to 4 instances of each kernel (2 of these 4 are never used by cuml, because we cast predictions to match the model type).

Here are the main kernels in cuml.explainer today:

kernels.csv
kernel,bytes
"void ML::Explainer::update_perm_shap_values_kernel<double, int>(double*, double const*, int, int const*)",1132
"void ML::Explainer::update_perm_shap_values_kernel<float, int>(float*, float const*, int, int const*)",1132
"void ML::Explainer::exact_rows_kernel<double, int>(float*, int, int, double*, int, double*, double*)",2488
"void ML::Explainer::exact_rows_kernel<float, int>(float*, int, int, float*, int, float*, float*)",2488
"void ML::Explainer::_fused_tile_scatter_pe<double, int>(double*, double const*, int, int, double const*, int*, int, int, bool)",2913
"void ML::Explainer::_fused_tile_scatter_pe<float, int>(float*, float const*, int, int, float const*, int*, int, int, bool)",2913
"void ML::Explainer::sampled_rows_kernel<float, int>(int*, float*, int, int, float*, int, float*, float*, unsigned long)",13160
"void ML::Explainer::sampled_rows_kernel<double, int>(int*, float*, int, int, double*, int, double*, double*, unsigned long)",13288
"void gpu_treeshap::detail::ShapKernel<(anonymous namespace)::DenseDatasetWrapper<double>, 256ul, 1024ul, (anonymous namespace)::SplitCondition<float> >((anonymous namespace)::DenseDatasetWrapper<double>, unsigned long, gpu_treeshap::PathElement<(anonymous namespace)::SplitCondition<float> > const*, unsigned long const*, unsigned long, double*)",39620
"void gpu_treeshap::detail::ShapKernel<(anonymous namespace)::DenseDatasetWrapper<float>, 256ul, 1024ul, (anonymous namespace)::SplitCondition<float> >((anonymous namespace)::DenseDatasetWrapper<float>, unsigned long, gpu_treeshap::PathElement<(anonymous namespace)::SplitCondition<float> > const*, unsigned long const*, unsigned long, double*)",39620
"void gpu_treeshap::detail::ShapKernel<(anonymous namespace)::DenseDatasetWrapper<double>, 256ul, 1024ul, (anonymous namespace)::SplitCondition<double> >((anonymous namespace)::DenseDatasetWrapper<double>, unsigned long, gpu_treeshap::PathElement<(anonymous namespace)::SplitCondition<double> > const*, unsigned long const*, unsigned long, double*)",41668
"void gpu_treeshap::detail::ShapKernel<(anonymous namespace)::DenseDatasetWrapper<float>, 256ul, 1024ul, (anonymous namespace)::SplitCondition<double> >((anonymous namespace)::DenseDatasetWrapper<float>, unsigned long, gpu_treeshap::PathElement<(anonymous namespace)::SplitCondition<double> > const*, unsigned long const*, unsigned long, double*)",41668
"void gpu_treeshap::detail::ShapInterventionalKernel<(anonymous namespace)::DenseDatasetWrapper<double>, 256ul, 100ul, (anonymous namespace)::SplitCondition<float> >((anonymous namespace)::DenseDatasetWrapper<double>, (anonymous namespace)::DenseDatasetWrapper<double>, unsigned long, gpu_treeshap::PathElement<(anonymous namespace)::SplitCondition<float> > const*, unsigned long const*, unsigned long, double*)",54824
"void gpu_treeshap::detail::ShapInterventionalKernel<(anonymous namespace)::DenseDatasetWrapper<float>, 256ul, 100ul, (anonymous namespace)::SplitCondition<float> >((anonymous namespace)::DenseDatasetWrapper<float>, (anonymous namespace)::DenseDatasetWrapper<float>, unsigned long, gpu_treeshap::PathElement<(anonymous namespace)::SplitCondition<float> > const*, unsigned long const*, unsigned long, double*)",54824
"void gpu_treeshap::detail::ShapInterventionalKernel<(anonymous namespace)::DenseDatasetWrapper<double>, 256ul, 100ul, (anonymous namespace)::SplitCondition<double> >((anonymous namespace)::DenseDatasetWrapper<double>, (anonymous namespace)::DenseDatasetWrapper<double>, unsigned long, gpu_treeshap::PathElement<(anonymous namespace)::SplitCondition<double> > const*, unsigned long const*, unsigned long, double*)",57000
"void gpu_treeshap::detail::ShapTaylorInteractionsKernel<(anonymous namespace)::DenseDatasetWrapper<double>, 256ul, 100ul, (anonymous namespace)::SplitCondition<float> >((anonymous namespace)::DenseDatasetWrapper<double>, unsigned long, gpu_treeshap::PathElement<(anonymous namespace)::SplitCondition<float> > const*, unsigned long const*, unsigned long, double*)",57056
"void gpu_treeshap::detail::ShapTaylorInteractionsKernel<(anonymous namespace)::DenseDatasetWrapper<float>, 256ul, 100ul, (anonymous namespace)::SplitCondition<float> >((anonymous namespace)::DenseDatasetWrapper<float>, unsigned long, gpu_treeshap::PathElement<(anonymous namespace)::SplitCondition<float> > const*, unsigned long const*, unsigned long, double*)",57056
"void gpu_treeshap::detail::ShapInterventionalKernel<(anonymous namespace)::DenseDatasetWrapper<float>, 256ul, 100ul, (anonymous namespace)::SplitCondition<double> >((anonymous namespace)::DenseDatasetWrapper<float>, (anonymous namespace)::DenseDatasetWrapper<float>, unsigned long, gpu_treeshap::PathElement<(anonymous namespace)::SplitCondition<double> > const*, unsigned long const*, unsigned long, double*)",57128
"void gpu_treeshap::detail::ShapTaylorInteractionsKernel<(anonymous namespace)::DenseDatasetWrapper<double>, 256ul, 100ul, (anonymous namespace)::SplitCondition<double> >((anonymous namespace)::DenseDatasetWrapper<double>, unsigned long, gpu_treeshap::PathElement<(anonymous namespace)::SplitCondition<double> > const*, unsigned long const*, unsigned long, double*)",59104
"void gpu_treeshap::detail::ShapTaylorInteractionsKernel<(anonymous namespace)::DenseDatasetWrapper<float>, 256ul, 100ul, (anonymous namespace)::SplitCondition<double> >((anonymous namespace)::DenseDatasetWrapper<float>, unsigned long, gpu_treeshap::PathElement<(anonymous namespace)::SplitCondition<double> > const*, unsigned long const*, unsigned long, double*)",59104
"void gpu_treeshap::detail::ShapInteractionsKernel<(anonymous namespace)::DenseDatasetWrapper<double>, 256ul, 100ul, (anonymous namespace)::SplitCondition<float> >((anonymous namespace)::DenseDatasetWrapper<double>, unsigned long, gpu_treeshap::PathElement<(anonymous namespace)::SplitCondition<float> > const*, unsigned long const*, unsigned long, double*)",60032
"void gpu_treeshap::detail::ShapInteractionsKernel<(anonymous namespace)::DenseDatasetWrapper<float>, 256ul, 100ul, (anonymous namespace)::SplitCondition<float> >((anonymous namespace)::DenseDatasetWrapper<float>, unsigned long, gpu_treeshap::PathElement<(anonymous namespace)::SplitCondition<float> > const*, unsigned long const*, unsigned long, double*)",60032
"void gpu_treeshap::detail::ShapInteractionsKernel<(anonymous namespace)::DenseDatasetWrapper<double>, 256ul, 100ul, (anonymous namespace)::SplitCondition<double> >((anonymous namespace)::DenseDatasetWrapper<double>, unsigned long, gpu_treeshap::PathElement<(anonymous namespace)::SplitCondition<double> > const*, unsigned long const*, unsigned long, double*)",62592
"void gpu_treeshap::detail::ShapInteractionsKernel<(anonymous namespace)::DenseDatasetWrapper<float>, 256ul, 100ul, (anonymous namespace)::SplitCondition<double> >((anonymous namespace)::DenseDatasetWrapper<float>, unsigned long, gpu_treeshap::PathElement<(anonymous namespace)::SplitCondition<double> > const*, unsigned long const*, unsigned long, double*)",62720

By my calculations, if we instead moved to compiling only for float32 (or only for float64), we'd shave off 0.64 MiB (per SM architecture compiled). Maybe worth it, considering that cuml.explainer isn't as performance-sensitive as the rest of cuml.

Metadata

Metadata

Assignees

Labels

No labels
No labels

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions