[oneDPL][ranges][merge] support size limit for output; fixes for __par_backend::__parallel_for

MikeDvorskiy · MikeDvorskiy · commit 006621002a4f · 2024-11-28T12:55:47.000+01:00
diff --git a/include/oneapi/dpl/pstl/algorithm_impl.h b/include/oneapi/dpl/pstl/algorithm_impl.h
@@ -2948,7 +2948,7 @@ __pattern_remove_if(__parallel_tag<_IsVector> __tag, _ExecutionPolicy&& __exec,
 // merge
 //------------------------------------------------------------------------
 
-template<std::random_access_iterator It1, std::random_access_iterator It2, std::random_access_iterator ItOut, typename _Comp>
+template<typename It1, typename It2, typename ItOut, typename _Comp>
 std::pair<It1, It2>
 __brick_merge_2(It1 __it_1, It1 __it_1_e, It2 __it_2, It2 __it_2_e, ItOut __it_out, ItOut __it_out_e, _Comp __comp,
               /* __is_vector = */ std::false_type)
@@ -3082,8 +3082,8 @@ __pattern_merge_2(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _It1 __i
 
                                             //serial merge n elements, starting from input x and y, to [i, j) output range
                                             auto __res = __brick_merge_2(__it_1 + __r, __it_1 + __n_1,
-                                                                       __it_2 + __c, __it_2 + __n_2,
-                                                                       __it_out + __i, __it_out + __j, __comp, _IsVector{});
+                                                                         __it_2 + __c, __it_2 + __n_2,
+                                                                         __it_out + __i, __it_out + __j, __comp, _IsVector{});
 
                                             if(__j == __n_out)
                                             {
diff --git a/include/oneapi/dpl/pstl/omp/parallel_for.h b/include/oneapi/dpl/pstl/omp/parallel_for.h
@@ -29,10 +29,10 @@ namespace __omp_backend
 
 template <class _Index, class _Fp>
 void
-__parallel_for_body(_Index __first, _Index __last, _Fp __f)
+__parallel_for_body(_Index __first, _Index __last, _Fp __f, std::size_t __grainsize)
 {
     // initial partition of the iteration space into chunks
-    auto __policy = oneapi::dpl::__omp_backend::__chunk_partitioner(__first, __last);
+    auto __policy = oneapi::dpl::__omp_backend::__chunk_partitioner(__first, __last, __grainsize);
 
     // To avoid over-subscription we use taskloop for the nested parallelism
     _PSTL_PRAGMA(omp taskloop untied mergeable)
@@ -49,20 +49,21 @@ __parallel_for_body(_Index __first, _Index __last, _Fp __f)
 
 template <class _ExecutionPolicy, class _Index, class _Fp>
 void
-__parallel_for(oneapi::dpl::__internal::__omp_backend_tag, _ExecutionPolicy&&, _Index __first, _Index __last, _Fp __f)
+__parallel_for(oneapi::dpl::__internal::__omp_backend_tag, _ExecutionPolicy&&, _Index __first, _Index __last, _Fp __f,
+               std::size_t __grainsize = __default_chunk_size)
 {
     if (omp_in_parallel())
     {
         // we don't create a nested parallel region in an existing parallel
         // region: just create tasks
-        oneapi::dpl::__omp_backend::__parallel_for_body(__first, __last, __f);
+        oneapi::dpl::__omp_backend::__parallel_for_body(__first, __last, __f, __grainsize);
     }
     else
     {
         // in any case (nested or non-nested) one parallel region is created and
         // only one thread creates a set of tasks
         _PSTL_PRAGMA(omp parallel)
-        _PSTL_PRAGMA(omp single nowait) { oneapi::dpl::__omp_backend::__parallel_for_body(__first, __last, __f); }
+        _PSTL_PRAGMA(omp single nowait) { oneapi::dpl::__omp_backend::__parallel_for_body(__first, __last, __f, __grainsize); }
     }
 }
 
diff --git a/include/oneapi/dpl/pstl/parallel_backend.h b/include/oneapi/dpl/pstl/parallel_backend.h
@@ -35,6 +35,9 @@
 #    endif
 #endif
 
+//the parallel backend constants
+#define _ONEDPL_MERGE_CUT_OFF 2000
+
 namespace oneapi
 {
 namespace dpl
diff --git a/include/oneapi/dpl/pstl/parallel_backend_serial.h b/include/oneapi/dpl/pstl/parallel_backend_serial.h
@@ -45,7 +45,7 @@ __cancel_execution(oneapi::dpl::__internal::__serial_backend_tag)
 template <class _ExecutionPolicy, class _Index, class _Fp>
 void
 __parallel_for(oneapi::dpl::__internal::__serial_backend_tag, _ExecutionPolicy&&, _Index __first, _Index __last,
-               _Fp __f)
+               _Fp __f, std::size_t __grainsize = 1)
 {
     __f(__first, __last);
 }

Original file line number	Diff line number	Diff line change
`@@ -29,10 +29,10 @@ namespace __omp_backend`
`29`	`29`
`30`	`30`	`template <class _Index, class _Fp>`
`31`	`31`	`void`
`32`		`-__parallel_for_body(_Index __first, _Index __last, _Fp __f)`
	`32`	`+__parallel_for_body(_Index __first, _Index __last, _Fp __f, std::size_t __grainsize)`
`33`	`33`	`{`
`34`	`34`	`// initial partition of the iteration space into chunks`
`35`		`- auto __policy = oneapi::dpl::__omp_backend::__chunk_partitioner(__first, __last);`
	`35`	`+ auto __policy = oneapi::dpl::__omp_backend::__chunk_partitioner(__first, __last, __grainsize);`
`36`	`36`
`37`	`37`	`// To avoid over-subscription we use taskloop for the nested parallelism`
`38`	`38`	`_PSTL_PRAGMA(omp taskloop untied mergeable)`
`@@ -49,20 +49,21 @@ __parallel_for_body(_Index __first, _Index __last, _Fp __f)`
`49`	`49`
`50`	`50`	`template <class _ExecutionPolicy, class _Index, class _Fp>`
`51`	`51`	`void`
`52`		`-__parallel_for(oneapi::dpl::__internal::__omp_backend_tag, _ExecutionPolicy&&, _Index __first, _Index __last, _Fp __f)`
	`52`	`+__parallel_for(oneapi::dpl::__internal::__omp_backend_tag, _ExecutionPolicy&&, _Index __first, _Index __last, _Fp __f,`
	`53`	`+ std::size_t __grainsize = __default_chunk_size)`
`53`	`54`	`{`
`54`	`55`	`if (omp_in_parallel())`
`55`	`56`	`{`
`56`	`57`	`// we don't create a nested parallel region in an existing parallel`
`57`	`58`	`// region: just create tasks`
`58`		`- oneapi::dpl::__omp_backend::__parallel_for_body(__first, __last, __f);`
	`59`	`+ oneapi::dpl::__omp_backend::__parallel_for_body(__first, __last, __f, __grainsize);`
`59`	`60`	`}`
`60`	`61`	`else`
`61`	`62`	`{`
`62`	`63`	`// in any case (nested or non-nested) one parallel region is created and`
`63`	`64`	`// only one thread creates a set of tasks`
`64`	`65`	`_PSTL_PRAGMA(omp parallel)`
`65`		`- _PSTL_PRAGMA(omp single nowait) { oneapi::dpl::__omp_backend::__parallel_for_body(__first, __last, __f); }`
	`66`	`+ _PSTL_PRAGMA(omp single nowait) { oneapi::dpl::__omp_backend::__parallel_for_body(__first, __last, __f, __grainsize); }`
`66`	`67`	`}`
`67`	`68`	`}`
`68`	`69`
Original file line number	Diff line number	Diff line change
`@@ -35,6 +35,9 @@`
`35`	`35`	`# endif`
`36`	`36`	`#endif`
`37`	`37`
	`38`	`+//the parallel backend constants`
	`39`	`+#define _ONEDPL_MERGE_CUT_OFF 2000`
	`40`	`+`
`38`	`41`	`namespace oneapi`
`39`	`42`	`{`
`40`	`43`	`namespace dpl`
Original file line number	Diff line number	Diff line change
`@@ -45,7 +45,7 @@ __cancel_execution(oneapi::dpl::__internal::__serial_backend_tag)`
`45`	`45`	`template <class _ExecutionPolicy, class _Index, class _Fp>`
`46`	`46`	`void`
`47`	`47`	`__parallel_for(oneapi::dpl::__internal::__serial_backend_tag, _ExecutionPolicy&&, _Index __first, _Index __last,`
`48`		`- _Fp __f)`
	`48`	`+ _Fp __f, std::size_t __grainsize = 1)`
`49`	`49`	`{`
`50`	`50`	`__f(__first, __last);`
`51`	`51`	`}`