Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Assorted fixes and OpenCL enhancements #153

Open
wants to merge 5 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
156 changes: 136 additions & 20 deletions libopencl/opencl_runtime_api.cc
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,9 @@
#include <map>
#include <string>

/* Defined in src/cuda-sim/ptx_loader.h. Unused for OpenCL for now. */
int no_of_ptx=0;

static void setErrCode(cl_int *errcode_ret, cl_int err_code) {
if ( errcode_ret ) {
*errcode_ret = err_code;
Expand Down Expand Up @@ -169,6 +172,21 @@ struct _cl_mem {
size_t m_size;
};

struct _cl_event {
private:
cl_ulong start;
cl_ulong cmd_end;
size_t refcount;
public:
_cl_event() : start(0), cmd_end(0), refcount(1) {}
cl_ulong getCmdEnd(void);
void setCmdEnd(cl_ulong);
cl_ulong getStart(void);
void setStart(cl_ulong);
void retain(void);
bool release(void);
};

struct pgm_info {
std::string m_source;
std::string m_asm;
Expand Down Expand Up @@ -355,6 +373,36 @@ _cl_mem::_cl_mem(
}
}

cl_ulong _cl_event::getCmdEnd( void )
{
return cmd_end;
}

void _cl_event::setCmdEnd( cl_ulong e )
{
cmd_end = e;
}

cl_ulong _cl_event::getStart( void )
{
return start;
}

void _cl_event::setStart( cl_ulong s )
{
start = s;
}

void _cl_event::retain( void )
{
refcount++;
}

bool _cl_event::release( void )
{
return ((--refcount) <= 0);
}

_cl_context::_cl_context( struct _cl_device_id *gpu )
{
m_uid = sm_context_uid++;
Expand Down Expand Up @@ -732,15 +780,6 @@ clCreateProgramWithBinary(cl_context /* context */,
return cl_program();
}

extern CL_API_ENTRY cl_int CL_API_CALL
clGetEventProfilingInfo(cl_event /* event */,
cl_profiling_info /* param_name */,
size_t /* param_value_size */,
void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0{
gpgpusim_opencl_warning(__my_func__,__LINE__, "GPGPUsim - OpenCLFunction is not implemented. Returning CL_SUCCESS");
return CL_SUCCESS;
}
/*******************************************************************************************************/


Expand Down Expand Up @@ -953,7 +992,13 @@ clEnqueueNDRangeKernel(cl_command_queue command_queue,
if ( err_val != CL_SUCCESS )
return err_val;

gpgpu_t *gpu = command_queue->get_device()->the_device();
gpgpu_sim *gpu = command_queue->get_device()->the_device();

if ( event ) {
*event = new _cl_event();
event[0]->setStart((gpu_tot_sim_cycle * 1000000) / gpu->shader_clock());
}

if (kernel->get_implementation()->get_ptx_version().ver() <3.0){
gpgpu_ptx_sim_memcpy_symbol( "%_global_size", _global_size, 3 * sizeof(int), 0, 1, gpu );
gpgpu_ptx_sim_memcpy_symbol( "%_work_dim", &work_dim, 1 * sizeof(int), 0, 1, gpu );
Expand All @@ -977,6 +1022,10 @@ clEnqueueNDRangeKernel(cl_command_queue command_queue,
gpgpu_opencl_ptx_sim_main_func( grid );
else
gpgpu_opencl_ptx_sim_main_perf( grid );

if ( event ) {
event[0]->setCmdEnd((gpu_tot_sim_cycle * 1000000) / gpu->shader_clock());
}
return CL_SUCCESS;
}

Expand Down Expand Up @@ -1166,11 +1215,15 @@ clGetDeviceInfo(cl_device_id device,
case CL_DEVICE_NAME: CL_STRING_CASE( "GPGPU-Sim" ); break;
case CL_DEVICE_GLOBAL_MEM_SIZE: CL_ULONG_CASE( 1024*1024*1024 ); break;
case CL_DEVICE_MAX_COMPUTE_UNITS: CL_UINT_CASE( device->the_device()->get_config().num_shader() ); break;
case CL_DEVICE_MAX_CLOCK_FREQUENCY: CL_UINT_CASE( device->the_device()->shader_clock() ); break;
case CL_DEVICE_MAX_CLOCK_FREQUENCY: CL_UINT_CASE( device->the_device()->shader_clock() / 1000 ); break;
case CL_DEVICE_PLATFORM: CL_STRING_CASE("GPGPU-Sim OpenCL platform"); break;
case CL_DEVICE_VENDOR:CL_STRING_CASE("GPGPU-Sim.org"); break;
case CL_DEVICE_VENDOR_ID:CL_UINT_CASE( 0x1337 ); break;
case CL_DEVICE_VERSION: CL_STRING_CASE("OpenCL 1.0"); break;
case CL_DEVICE_OPENCL_C_VERSION: CL_STRING_CASE("OpenCL C 1.0"); break;
case CL_DRIVER_VERSION: CL_STRING_CASE("1.0"); break;
case CL_DEVICE_TYPE: CL_CASE(cl_device_type, CL_DEVICE_TYPE_GPU); break;
case CL_DEVICE_PROFILE: CL_STRING_CASE("FULL_PROFILE"); break;
case CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS: CL_INT_CASE( 3 ); break;
case CL_DEVICE_MAX_WORK_ITEM_SIZES:
if( param_value && param_value_size < 3*sizeof(size_t) ) return CL_INVALID_VALUE; \
Expand All @@ -1180,10 +1233,11 @@ clGetDeviceInfo(cl_device_id device,
((size_t*)param_value)[1] = n_thread_per_shader;
((size_t*)param_value)[2] = n_thread_per_shader;
}
if( param_value_size_ret ) *param_value_size_ret = 3*sizeof(cl_uint);
if( param_value_size_ret ) *param_value_size_ret = 3*sizeof(size_t);
break;
case CL_DEVICE_MAX_WORK_GROUP_SIZE: CL_INT_CASE( device->the_device()->threads_per_core() ); break;
case CL_DEVICE_ADDRESS_BITS: CL_INT_CASE( 32 ); break;
case CL_DEVICE_ENDIAN_LITTLE: CL_BOOL_CASE( CL_TRUE ); break;
case CL_DEVICE_AVAILABLE: CL_BOOL_CASE( CL_TRUE ); break;
case CL_DEVICE_COMPILER_AVAILABLE: CL_BOOL_CASE( CL_TRUE ); break;
case CL_DEVICE_IMAGE_SUPPORT: CL_INT_CASE( CL_TRUE ); break;
Expand All @@ -1205,16 +1259,17 @@ clGetDeviceInfo(cl_device_id device,
if( param_value ) buf[0]=0;
if( param_value_size_ret ) *param_value_size_ret = 1;
break;
case CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR: CL_INT_CASE(1); break;
case CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT: CL_INT_CASE(1); break;
case CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT: CL_INT_CASE(1); break;
case CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG: CL_INT_CASE(1); break;
case CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT: CL_INT_CASE(1); break;
case CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE: CL_INT_CASE(0); break;
case CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR: CL_UINT_CASE(1); break;
case CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT: CL_UINT_CASE(1); break;
case CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT: CL_UINT_CASE(1); break;
case CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG: CL_UINT_CASE(1); break;
case CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT: CL_UINT_CASE(1); break;
case CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE: CL_UINT_CASE(0); break;
case CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF: CL_UINT_CASE(1); break;
case CL_DEVICE_SINGLE_FP_CONFIG: CL_INT_CASE(0); break;
case CL_DEVICE_MEM_BASE_ADDR_ALIGN: CL_INT_CASE(256*8); break;
default:
opencl_not_implemented(__my_func__,__LINE__);
return CL_INVALID_VALUE;
}
return CL_SUCCESS;
}
Expand All @@ -1225,6 +1280,61 @@ clFinish(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0
return CL_SUCCESS;
}

extern CL_API_ENTRY cl_int CL_API_CALL
clRetainMemObject(cl_mem memobj) CL_API_SUFFIX__VERSION_1_0
{
return CL_SUCCESS;
}

extern CL_API_ENTRY cl_int CL_API_CALL
clRetainEvent(cl_event event) CL_API_SUFFIX__VERSION_1_0
{
if ( !event )
return CL_INVALID_EVENT;

event->retain();
return CL_SUCCESS;
}

extern CL_API_ENTRY cl_int CL_API_CALL
clRetainKernel(cl_kernel kernel) CL_API_SUFFIX__VERSION_1_0
{
return CL_SUCCESS;
}

extern CL_API_ENTRY cl_int CL_API_CALL
clRetainDevice(cl_device_id device) CL_API_SUFFIX__VERSION_1_0
{
return CL_SUCCESS;
}

extern CL_API_ENTRY cl_int CL_API_CALL
clGetEventProfilingInfo(cl_event event,
cl_profiling_info param_name,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0{
if ( !event )
return CL_INVALID_EVENT;

switch (param_name) {
case CL_PROFILING_COMMAND_QUEUED:
case CL_PROFILING_COMMAND_SUBMIT:
case CL_PROFILING_COMMAND_START:
CL_ULONG_CASE( event->getStart() );
break;
case CL_PROFILING_COMMAND_END:
CL_ULONG_CASE( event->getCmdEnd() );
break;
default:
return CL_INVALID_VALUE;
break;
}

//gpgpusim_opencl_warning(__my_func__,__LINE__, "GPGPUsim - OpenCLFunction is not implemented. Returning CL_SUCCESS");
return CL_SUCCESS;
}

extern CL_API_ENTRY cl_int CL_API_CALL
clGetProgramInfo(cl_program program,
cl_program_info param_name,
Expand Down Expand Up @@ -1374,8 +1484,14 @@ clWaitForEvents(cl_uint /* num_events */,
}

extern CL_API_ENTRY cl_int CL_API_CALL
clReleaseEvent(cl_event /* event */) CL_API_SUFFIX__VERSION_1_0
clReleaseEvent(cl_event e) CL_API_SUFFIX__VERSION_1_0
{
if ( e == nullptr )
return CL_INVALID_EVENT;

if ( e->release() )
delete e;

return CL_SUCCESS;
}

Expand Down
85 changes: 44 additions & 41 deletions src/gpgpu-sim/shader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2998,47 +2998,50 @@ unsigned int shader_core_config::max_cta( const kernel_info_t &k ) const
return result;
}

void shader_core_config::set_pipeline_latency() {

//calculate the max latency based on the input

unsigned int_latency[6];
unsigned fp_latency[5];
unsigned dp_latency[5];
unsigned sfu_latency;
unsigned tensor_latency;

/*
* [0] ADD,SUB
* [1] MAX,Min
* [2] MUL
* [3] MAD
* [4] DIV
* [5] SHFL
*/
sscanf(opcode_latency_int, "%u,%u,%u,%u,%u,%u",
&int_latency[0],&int_latency[1],&int_latency[2],
&int_latency[3],&int_latency[4],&int_latency[5]);
sscanf(opcode_latency_fp, "%u,%u,%u,%u,%u",
&fp_latency[0],&fp_latency[1],&fp_latency[2],
&fp_latency[3],&fp_latency[4]);
sscanf(opcode_latency_dp, "%u,%u,%u,%u,%u",
&dp_latency[0],&dp_latency[1],&dp_latency[2],
&dp_latency[3],&dp_latency[4]);
sscanf(opcode_latency_sfu, "%u",
&sfu_latency);
sscanf(opcode_latency_tensor, "%u",
&tensor_latency);

//all div operation are executed on sfu
//assume that the max latency are dp div or normal sfu_latency
max_sfu_latency = std::max(dp_latency[4],sfu_latency);
//assume that the max operation has the max latency
max_sp_latency = fp_latency[1];
max_int_latency = std::max(int_latency[1],int_latency[5]);
max_dp_latency = dp_latency[1];
max_tensor_core_latency = tensor_latency;

void shader_core_config::set_pipeline_latency()
{
// calculate the max latency based on the input

unsigned int_latency[6];
unsigned fp_latency[5];
unsigned dp_latency[5];
unsigned sfu_latency;
unsigned tensor_latency;

/*
* [0] ADD,SUB
* [1] MAX,Min
* [2] MUL
* [3] MAD
* [4] DIV
* [5] SHFL
*/
sscanf(opcode_latency_int, "%u,%u,%u,%u,%u,%u",
&int_latency[0],&int_latency[1],&int_latency[2],
&int_latency[3],&int_latency[4],&int_latency[5]);
sscanf(opcode_latency_fp, "%u,%u,%u,%u,%u",
&fp_latency[0],&fp_latency[1],&fp_latency[2],
&fp_latency[3],&fp_latency[4]);
sscanf(opcode_latency_dp, "%u,%u,%u,%u,%u",
&dp_latency[0],&dp_latency[1],&dp_latency[2],
&dp_latency[3],&dp_latency[4]);
sscanf(opcode_latency_sfu, "%u",
&sfu_latency);
sscanf(opcode_latency_tensor, "%u",
&tensor_latency);

// all div operation are executed on sfu
// assume that the max latency are dp div or normal sfu_latency
max_sfu_latency = std::max(dp_latency[4],sfu_latency);
// assume that the max operation has the max latency
max_sp_latency = fp_latency[1];
max_int_latency = std::max(int_latency[1],int_latency[5]);
max_dp_latency = dp_latency[1];
max_tensor_core_latency = tensor_latency;

// Fermi GPUs have SP units that perform both FP and int arith.
if (gpgpu_num_int_units == 0)
max_sp_latency = std::max(max_sp_latency, max_int_latency);
}

void shader_core_ctx::cycle()
Expand Down