-
Notifications
You must be signed in to change notification settings - Fork 115
/
Copy pathparallel_backend_sycl_fpga.h
116 lines (99 loc) · 4.71 KB
/
parallel_backend_sycl_fpga.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
// -*- C++ -*-
//===-- parallel_backend_sycl_fpga.h --------------------------------------===//
//
// Copyright (C) Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// This file incorporates work covered by the following copyright and permission
// notice:
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
//
//===----------------------------------------------------------------------===//
//!!! NOTE: This file should be included under the macro _ONEDPL_BACKEND_SYCL
// This header guard is used to check inclusion of DPC++ backend for the FPGA.
// Changing this macro may result in broken tests.
#ifndef _ONEDPL_PARALLEL_BACKEND_SYCL_FPGA_H
#define _ONEDPL_PARALLEL_BACKEND_SYCL_FPGA_H
#include <cassert>
#include <algorithm>
#include <type_traits>
#include "sycl_defs.h"
#include "parallel_backend_sycl_utils.h"
// workaround until we implement more performant optimization for patterns
#include "parallel_backend_sycl.h"
#include "parallel_backend_sycl_histogram.h"
#include "../../execution_impl.h"
#include "execution_sycl_defs.h"
#include "../../iterator_impl.h"
#include "sycl_iterator.h"
#include "sycl_traits.h" //SYCL traits specialization for some oneDPL types.
namespace oneapi
{
namespace dpl
{
namespace __par_backend_hetero
{
//------------------------------------------------------------------------
// parallel_for
//------------------------------------------------------------------------
//General version of parallel_for, one additional parameter - __count of iterations of loop __cgh.parallel_for,
//for some algorithms happens that size of processing range is n, but amount of iterations is n/2.
// Please see the comment above __parallel_for_small_submitter for optional kernel name explanation
template <typename _Name>
struct __parallel_for_fpga_submitter;
template <typename... _Name>
struct __parallel_for_fpga_submitter<__internal::__optional_kernel_name<_Name...>>
{
template <typename _ExecutionPolicy, typename _Fp, typename _Index, typename... _Ranges>
auto
operator()(const _ExecutionPolicy& __exec, _Fp __brick, _Index __count, _Ranges&&... __rngs) const
{
assert(oneapi::dpl::__ranges::__get_first_range_size(__rngs...) > 0);
_PRINT_INFO_IN_DEBUG_MODE(__exec);
auto __event = __exec.queue().submit([&__rngs..., &__brick, __count](sycl::handler& __cgh) {
//get an access to data under SYCL buffer:
oneapi::dpl::__ranges::__require_access(__cgh, __rngs...);
__cgh.single_task<_Name...>([=]() {
#pragma unroll(::std::decay <_ExecutionPolicy>::type::unroll_factor)
for (auto __idx = 0; __idx < __count; ++__idx)
{
__brick.__scalar_path_impl(std::true_type{}, __idx, __rngs...);
}
});
});
return __future(__event);
}
};
template <typename _ExecutionPolicy, typename _Fp, typename _Index, typename... _Ranges>
auto
__parallel_for(oneapi::dpl::__internal::__fpga_backend_tag, _ExecutionPolicy&& __exec, _Fp __brick, _Index __count,
_Ranges&&... __rngs)
{
using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>;
using __parallel_for_name = __internal::__kernel_name_provider<_CustomName>;
return __parallel_for_fpga_submitter<__parallel_for_name>()(__exec, __brick, __count,
std::forward<_Ranges>(__rngs)...);
}
//------------------------------------------------------------------------
// parallel_histogram
//-----------------------------------------------------------------------
// TODO: check if it makes sense to move these wrappers out of backend to a common place
template <typename _ExecutionPolicy, typename _Event, typename _Range1, typename _Range2, typename _BinHashMgr>
auto
__parallel_histogram(oneapi::dpl::__internal::__fpga_backend_tag, _ExecutionPolicy&& __exec, const _Event& __init_event,
_Range1&& __input, _Range2&& __bins, const _BinHashMgr& __binhash_manager)
{
static_assert(sizeof(oneapi::dpl::__internal::__value_t<_Range2>) <= sizeof(::std::uint32_t),
"histogram is not supported on FPGA devices with output types greater than 32 bits");
// workaround until we implement more performant version for patterns
return oneapi::dpl::__par_backend_hetero::__parallel_histogram(
oneapi::dpl::__internal::__device_backend_tag{}, __exec, __init_event, ::std::forward<_Range1>(__input),
::std::forward<_Range2>(__bins), __binhash_manager);
}
} // namespace __par_backend_hetero
} // namespace dpl
} // namespace oneapi
#endif // _ONEDPL_PARALLEL_BACKEND_SYCL_FPGA_H