Skip to content

Commit db5b167

Browse files
authored
Merge pull request #4 from meifeng/feature/omp-usm
Feature/omp usm
2 parents b626a5b + d76483b commit db5b167

39 files changed

Lines changed: 68925 additions & 259 deletions

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,5 @@
11
*.x
2+
*.swp
3+
*.out
4+
*.o
25
*.log

Benchmark_REAL.ll

Lines changed: 66728 additions & 0 deletions
Large diffs are not rendered by default.

Grid/allocator/AlignedAllocator.h

Lines changed: 33 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
2929
#ifndef GRID_ALIGNED_ALLOCATOR_H
3030
#define GRID_ALIGNED_ALLOCATOR_H
3131

32+
#include <Grid/threads/Pragmas.h>
3233
#ifdef HAVE_MALLOC_MALLOC_H
3334
#include <malloc/malloc.h>
3435
#endif
@@ -43,8 +44,14 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
4344
#define POINTER_CACHE
4445
#define GRID_ALLOC_ALIGN (2*1024*1024)
4546

47+
#ifdef OMPTARGET_MANAGED
48+
#include <cuda_runtime_api.h>
49+
#endif
4650
NAMESPACE_BEGIN(Grid);
4751

52+
53+
extern "C" void *llvm_omp_target_alloc_shared(size_t,int);
54+
4855
// Move control to configure.ac and Config.h?
4956
#ifdef POINTER_CACHE
5057
class PointerCache {
@@ -165,7 +172,7 @@ class alignedAllocator {
165172
pointer ptr = nullptr;
166173
#endif
167174

168-
#ifdef GRID_NVCC
175+
#if defined(GRID_NVCC) || defined (OMPTARGET_MANAGED)
169176
////////////////////////////////////
170177
// Unified (managed) memory
171178
////////////////////////////////////
@@ -178,6 +185,11 @@ class alignedAllocator {
178185
}
179186
}
180187
assert( ptr != (_Tp *)NULL);
188+
//cudaMemAdvise ( (void*)ptr, bytes, cudaMemAdviseSetPreferredLocation, 0);
189+
#elif defined (OMPTARGET_UVM)
190+
const int device_id = (omp_get_num_devices() > 0) ? omp_get_default_device() : omp_get_initial_device();
191+
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) omp_target_alloc(bytes, device_id);
192+
std::cout <<"OMPTARGET_UVM"<<std::endl;
181193
#else
182194
//////////////////////////////////////////////////////////////////////////////////////////
183195
// 2MB align; could make option probably doesn't need configurability
@@ -189,6 +201,13 @@ class alignedAllocator {
189201
#endif
190202
assert( ptr != (_Tp *)NULL);
191203

204+
//FIXME: NOT WORKING
205+
//#ifdef OMPTARGET
206+
//#pragma omp target enter data map(alloc:ptr[0:__n])
207+
//for(int n=0;n<__n;n++){
208+
//#pragma omp target enter data map(alloc:ptr[n][0:sizeof(_Tp)])
209+
//}
210+
//#endif
192211
//////////////////////////////////////////////////
193212
// First touch optimise in threaded loop
194213
//////////////////////////////////////////////////
@@ -211,14 +230,26 @@ class alignedAllocator {
211230
pointer __freeme = __p;
212231
#endif
213232

214-
#ifdef GRID_NVCC
233+
#if defined(GRID_NVCC) || defined (OMPTARGET_MANAGED)
215234
if ( __freeme ) cudaFree((void *)__freeme);
235+
236+
#elif defined (OMPTARGET_UVM)
237+
const int device_id = (omp_get_num_devices() > 0) ? omp_get_default_device() : omp_get_initial_device();
238+
omp_target_free(__freeme, device_id );
239+
216240
#else
217241
#ifdef HAVE_MM_MALLOC_H
218242
if ( __freeme ) _mm_free((void *)__freeme);
219243
#else
220244
if ( __freeme ) free((void *)__freeme);
221245
#endif
246+
247+
//FIXME: NOT WORKING
248+
//for(int n=0;n<__n;n++){
249+
//#pragma omp target exit data map(delete:__freeme[n][0:sizeof(_Tp)])
250+
//}
251+
//#pragma omp target exit data map(delete:__freeme[0:__n])
252+
222253
#endif
223254
}
224255

Grid/lattice/Lattice_ET.h

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,17 +83,26 @@ struct getVectorType<Lattice<T> >{
8383
template<class sobj> accelerator_inline
8484
sobj eval(const uint64_t ss, const sobj &arg)
8585
{
86+
#ifdef DEBUG
87+
printf("Lattice_ET.h: eval(const uint64_t ss, const sobj &arg)\n");
88+
#endif
8689
return arg;
8790
}
8891

8992
template <class lobj> accelerator_inline
9093
const lobj & eval(const uint64_t ss, const LatticeView<lobj> &arg)
9194
{
95+
#ifdef DEBUG
96+
printf("Lattice_ET.h: eval(const uint64_t ss, const LatticeView<lobj> &arg)\n");
97+
#endif
9298
return arg[ss];
9399
}
94100
template <class lobj> accelerator_inline
95101
const lobj & eval(const uint64_t ss, const Lattice<lobj> &arg)
96102
{
103+
#ifdef DEBUG
104+
printf("Lattice_ET.h: eval(const uint64_t ss, const Lattice<lobj> &arg)\n");
105+
#endif
97106
auto view = arg.View();
98107
return view[ss];
99108
}
@@ -114,6 +123,12 @@ template <typename Op, typename T1, typename T2> accelerator_inline
114123
auto eval(const uint64_t ss, const LatticeBinaryExpression<Op, T1, T2> &expr)
115124
-> decltype(expr.op.func( eval(ss,expr.arg1),eval(ss,expr.arg2)))
116125
{
126+
#ifdef DEBUG
127+
printf("eval in lattice/Lattice_ET.h: expr.arg1=%f\n",expr.arg1[ss]._internal._internal._internal.v.v[0]);
128+
printf("eval in lattice/Lattice_ET.h: expr.arg2=%f\n",expr.arg2[ss]._internal._internal._internal.v.v[0]);
129+
auto tmp=expr.op.func( eval(ss,expr.arg1), eval(ss,expr.arg2) );
130+
printf("eval in lattice/Lattice_ET.h: eval= %f\n",tmp._internal._internal._internal.v.v[0]);
131+
#endif
117132
return expr.op.func( eval(ss,expr.arg1), eval(ss,expr.arg2) );
118133
}
119134
///////////////////////

Grid/lattice/Lattice_base.h

Lines changed: 54 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -60,12 +60,11 @@ void accelerator_inline conformable(GridBase *lhs,GridBase *rhs)
6060
#define LATTICE_VIEW_STRICT
6161
template<class vobj> class LatticeAccelerator : public LatticeBase
6262
{
63-
protected:
63+
public:
6464
GridBase *_grid;
6565
int checkerboard;
6666
vobj *_odata; // A managed pointer
6767
uint64_t _odata_size;
68-
public:
6968
accelerator_inline LatticeAccelerator() : checkerboard(0), _odata(nullptr), _odata_size(0), _grid(nullptr) { };
7069
accelerator_inline uint64_t oSites(void) const { return _odata_size; };
7170
accelerator_inline int Checkerboard(void) const { return checkerboard; };
@@ -230,6 +229,7 @@ class Lattice : public LatticeAccelerator<vobj>
230229
this->checkerboard=cb;
231230

232231
auto me = View();
232+
//#pragma omp target teams distribute parallel for
233233
accelerator_for(ss,me.size(),1,{
234234
auto tmp = eval(ss,expr);
235235
vstream(me[ss],tmp);
@@ -249,9 +249,26 @@ class Lattice : public LatticeAccelerator<vobj>
249249
this->checkerboard=cb;
250250

251251
auto me = View();
252+
int size = me.size();
253+
// printf("size:%d\n",size);
254+
/*
255+
auto in1 = expr.arg1;
256+
auto in2 = expr.arg2;
257+
258+
int in_size = in1.size();
259+
auto in1_ptr = &in1[0];
260+
auto in2_ptr = &in2[0];
261+
int me_size = me.size();
262+
auto me_ptr = &me[0];
263+
*/
264+
265+
//#pragma omp target teams distribute parallel for
252266
accelerator_for(ss,me.size(),1,{
253-
auto tmp = eval(ss,expr);
254-
vstream(me[ss],tmp);
267+
me[ss] = eval(ss,expr);
268+
#ifdef DEBUG
269+
if (ss==0) printf("operator= in lattice/Lattice_base.h: me[ss] = %f\n",me[ss]._internal._internal._internal.v.v[0]);
270+
#endif
271+
//vstream(me[ss],tmp);
255272
});
256273
return *this;
257274
}
@@ -284,10 +301,13 @@ class Lattice : public LatticeAccelerator<vobj>
284301
CBFromExpression(cb,expr);
285302
assert( (cb==Odd) || (cb==Even));
286303
this->checkerboard=cb;
287-
304+
int gsize = this->_grid->oSites();
288305
resize(this->_grid->oSites());
289-
290306
*this = expr;
307+
std::cout<<"1"<<std::endl;
308+
//#pragma omp target enter data map(to:this[0:1])
309+
//#pragma omp target enter data map(to:this->_odata[0:gsize])
310+
std::cout<<"2"<<std::endl;
291311
}
292312
template<class Op,class T1, class T2>
293313
Lattice(const LatticeBinaryExpression<Op,T1,T2> & expr) {
@@ -300,9 +320,14 @@ class Lattice : public LatticeAccelerator<vobj>
300320
assert( (cb==Odd) || (cb==Even));
301321
this->checkerboard=cb;
302322

323+
int gsize = this->_grid->oSites();
303324
resize(this->_grid->oSites());
304325

305326
*this = expr;
327+
std::cout<<"3"<<std::endl;
328+
//#pragma omp target enter data map(to:this[0:1])
329+
//#pragma omp target enter data map(to:this->_odata[0:gsize])
330+
std::cout<<"4"<<std::endl;
306331
}
307332
template<class Op,class T1, class T2, class T3>
308333
Lattice(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr) {
@@ -315,16 +340,23 @@ class Lattice : public LatticeAccelerator<vobj>
315340
assert( (cb==Odd) || (cb==Even));
316341
this->checkerboard=cb;
317342

343+
int gsize = this->_grid->oSites();
318344
resize(this->_grid->oSites());
319345

320346
*this = expr;
347+
std::cout<<"5"<<std::endl;
348+
//#pragma omp target enter data map(to:this[0:1])
349+
//#pragma omp target enter data map(to:this->_odata[0:gsize])
350+
std::cout<<"6"<<std::endl;
321351
}
322352

323353
template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){
324354
auto me = View();
325355
thread_for(ss,me.size(),{
326356
me[ss] = r;
327357
});
358+
//std::cout<<"AA"<<std::endl;
359+
//#pragma omp target update to(me[0:me.size()])
328360
return *this;
329361
}
330362

@@ -339,6 +371,12 @@ class Lattice : public LatticeAccelerator<vobj>
339371
resize(this->_grid->oSites());
340372
assert((((uint64_t)&this->_odata[0])&0xF) ==0);
341373
this->checkerboard=0;
374+
int gsize=this->_grid->oSites();
375+
//std::cout<<"7"<<std::endl;
376+
//#pragma omp target enter data map(to:this[0:1])
377+
//#pragma omp target enter data map(to:this->_odata[0:gsize])
378+
//std::cout<<"8"<<std::endl;
379+
342380
}
343381

344382
// virtual ~Lattice(void) = default;
@@ -358,6 +396,12 @@ class Lattice : public LatticeAccelerator<vobj>
358396
this->_grid = r.Grid();
359397
resize(this->_grid->oSites());
360398
*this = r;
399+
400+
int gsize=this->_grid->oSites();
401+
std::cout<<"9"<<std::endl;
402+
//#pragma omp target enter data map(to:this[0:1])
403+
//#pragma omp target enter data map(to:this->_odata[0:gsize])
404+
std::cout<<"10"<<std::endl;
361405
}
362406
///////////////////////////////////////////
363407
// move constructor
@@ -382,6 +426,7 @@ class Lattice : public LatticeAccelerator<vobj>
382426
accelerator_for(ss,me.size(),vobj::Nsimd(),{
383427
coalescedWrite(me[ss],him(ss));
384428
});
429+
std::cout<<"BB"<<std::endl;
385430
return *this;
386431
}
387432

@@ -396,6 +441,7 @@ class Lattice : public LatticeAccelerator<vobj>
396441
accelerator_for(ss,me.size(),vobj::Nsimd(),{
397442
coalescedWrite(me[ss],him(ss));
398443
});
444+
std::cout<<"CC"<<std::endl;
399445
return *this;
400446
}
401447
///////////////////////////////////////////
@@ -411,7 +457,8 @@ class Lattice : public LatticeAccelerator<vobj>
411457

412458
r._odata = nullptr;
413459
r._odata_size = 0;
414-
460+
std::cout<<"DD"<<std::endl;
461+
415462
return *this;
416463
}
417464

0 commit comments

Comments
 (0)