diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 000000000..f5843f2be --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,4 @@ +{ + "name": "CUDA 12.8", + "image": "ghcr.io/accel-sim/accel-sim-framework:Ubuntu-24.04-cuda-12.8" +} \ No newline at end of file diff --git a/.devcontainer/sst_integration/devcontainer.json b/.devcontainer/sst_integration/devcontainer.json new file mode 100644 index 000000000..e3732b4e3 --- /dev/null +++ b/.devcontainer/sst_integration/devcontainer.json @@ -0,0 +1,4 @@ +{ + "name": "SST CUDA 11.7", + "image": "ghcr.io/accel-sim/accel-sim-framework:SST-Integration-Ubuntu-22.04-cuda-11.7-llvm-18.1.8-riscv-gnu-2024.08.06-nightly" +} \ No newline at end of file diff --git a/.github/workflows/accelsim.yml b/.github/workflows/accelsim.yml index 741035226..39ae46df0 100644 --- a/.github/workflows/accelsim.yml +++ b/.github/workflows/accelsim.yml @@ -22,7 +22,7 @@ jobs: build-QV100: runs-on: ubuntu-latest container: - image: tgrogers/accel-sim_regress:Ubuntu-22.04-cuda-11.7 + image: ghcr.io/accel-sim/accel-sim-framework:ubuntu-24.04-cuda-12.8 # Steps represent a sequence of tasks that will be executed as part of the job steps: diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index c0a22ebf2..c37bd412b 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -18,7 +18,7 @@ jobs: build-TITANV: runs-on: ubuntu-latest container: - image: tgrogers/accel-sim_regress:Ubuntu-22.04-cuda-11.7 + image: ghcr.io/accel-sim/accel-sim-framework:ubuntu-24.04-cuda-12.8 env: CONFIG: TITANV @@ -32,7 +32,7 @@ jobs: build-TITANV-LOCALXBAR: runs-on: ubuntu-latest container: - image: tgrogers/accel-sim_regress:Ubuntu-22.04-cuda-11.7 + image: ghcr.io/accel-sim/accel-sim-framework:ubuntu-24.04-cuda-12.8 env: CONFIG: TITANV-LOCALXBAR @@ -46,7 +46,7 @@ jobs: build-QV100: runs-on: ubuntu-latest container: - image: tgrogers/accel-sim_regress:Ubuntu-22.04-cuda-11.7 + image: ghcr.io/accel-sim/accel-sim-framework:ubuntu-24.04-cuda-12.8 env: CONFIG: QV100 @@ -60,7 +60,7 @@ jobs: build-2060: runs-on: ubuntu-latest container: - image: tgrogers/accel-sim_regress:Ubuntu-22.04-cuda-11.7 + image: ghcr.io/accel-sim/accel-sim-framework:ubuntu-24.04-cuda-12.8 env: CONFIG: RTX2060 @@ -74,7 +74,7 @@ jobs: build-3070: runs-on: ubuntu-latest container: - image: tgrogers/accel-sim_regress:Ubuntu-22.04-cuda-11.7 + image: ghcr.io/accel-sim/accel-sim-framework:ubuntu-24.04-cuda-12.8 env: CONFIG: RTX3070 diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 954b98faf..a4f838daf 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -18,7 +18,7 @@ jobs: build-TITANV: runs-on: ubuntu-latest container: - image: tgrogers/accel-sim_regress:Ubuntu-22.04-cuda-11.7 + image: ghcr.io/accel-sim/accel-sim-framework:ubuntu-24.04-cuda-12.8 env: CONFIG: TITANV @@ -32,7 +32,7 @@ jobs: build-TITANV-LOCALXBAR: runs-on: ubuntu-latest container: - image: tgrogers/accel-sim_regress:Ubuntu-22.04-cuda-11.7 + image: ghcr.io/accel-sim/accel-sim-framework:ubuntu-24.04-cuda-12.8 env: CONFIG: TITANV-LOCALXBAR @@ -46,7 +46,7 @@ jobs: build-QV100: runs-on: ubuntu-latest container: - image: tgrogers/accel-sim_regress:Ubuntu-22.04-cuda-11.7 + image: ghcr.io/accel-sim/accel-sim-framework:ubuntu-24.04-cuda-12.8 env: CONFIG: QV100 @@ -60,7 +60,7 @@ jobs: build-2060: runs-on: ubuntu-latest container: - image: tgrogers/accel-sim_regress:Ubuntu-22.04-cuda-11.7 + image: ghcr.io/accel-sim/accel-sim-framework:ubuntu-24.04-cuda-12.8 env: CONFIG: RTX2060 @@ -74,7 +74,7 @@ jobs: build-3070: runs-on: ubuntu-latest container: - image: tgrogers/accel-sim_regress:Ubuntu-22.04-cuda-11.7 + image: ghcr.io/accel-sim/accel-sim-framework:ubuntu-24.04-cuda-12.8 env: CONFIG: RTX3070 diff --git a/.gitignore b/.gitignore index 340277af8..c27f9218d 100644 --- a/.gitignore +++ b/.gitignore @@ -56,9 +56,17 @@ debug_tools/WatchYourStep/ptxjitplus/gpgpu* debug_tools/WatchYourStep/ptxjitplus/*.old debug_tools/WatchYourStep/ptxjitplus/ptxjitplus debug_tools/WatchYourStep/ptxjitplus/*.ptx +*.tmp # Accel-sim packages used for regressions accel-sim-framework/ gpu-app-collection/ setup + +# OS/IDE specific files +.idea/ +.vscode/ +.DS_Store +.DS_store +__pycache__/ diff --git a/CMakeLists.txt b/CMakeLists.txt index 95ca8e085..2292e8b1b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -164,4 +164,8 @@ install(CODE "execute_process\(\ install(CODE "execute_process\(\ COMMAND ${CMAKE_COMMAND} -E create_symlink \ ${GPGPUSIM_INSTALL_PATH}/$ \ - ${GPGPUSIM_INSTALL_PATH}/$.11.0\)") \ No newline at end of file + ${GPGPUSIM_INSTALL_PATH}/$.11.0\)") +install(CODE "execute_process\(\ + COMMAND ${CMAKE_COMMAND} -E create_symlink \ + ${GPGPUSIM_INSTALL_PATH}/$ \ + ${GPGPUSIM_INSTALL_PATH}/$.12\)") diff --git a/Makefile b/Makefile index 37dba0146..35457c98b 100644 --- a/Makefile +++ b/Makefile @@ -169,6 +169,7 @@ $(SIM_LIB_DIR)/libcudart.so: makedirs $(LIBS) cudalib if [ ! -f $(SIM_LIB_DIR)/libcudart.so.10.0 ]; then ln -s libcudart.so $(SIM_LIB_DIR)/libcudart.so.10.0; fi if [ ! -f $(SIM_LIB_DIR)/libcudart.so.10.1 ]; then ln -s libcudart.so $(SIM_LIB_DIR)/libcudart.so.10.1; fi if [ ! -f $(SIM_LIB_DIR)/libcudart.so.11.0 ]; then ln -s libcudart.so $(SIM_LIB_DIR)/libcudart.so.11.0; fi + if [ ! -f $(SIM_LIB_DIR)/libcudart.so.12 ]; then ln -s libcudart.so $(SIM_LIB_DIR)/libcudart.so.12; fi if [ ! -f $(SIM_LIB_DIR)/libcudart_mod.so ]; then ln -s libcudart.so $(SIM_LIB_DIR)/libcudart_mod.so; fi $(SIM_LIB_DIR)/libcudart.dylib: makedirs $(LIBS) cudalib diff --git a/bitbucket-pipelines.yml b/bitbucket-pipelines.yml deleted file mode 100644 index 6e331d424..000000000 --- a/bitbucket-pipelines.yml +++ /dev/null @@ -1,15 +0,0 @@ -# This is a sample build configuration for C++ – Make. -# Check our guides at https://confluence.atlassian.com/x/5Q4SMw for more examples. -# Only use spaces to indent your .yml configuration. -# ----- -# You can specify a custom docker image from Docker Hub as your build environment. -image: tgrogers/gpgpu-sim_regress:latest - -pipelines: - default: - - step: - script: # Modify the commands below to build your repository. - - docker run -v `pwd`:/home/runner/gpgpu-sim_distribution:rw tgrogers/gpgpu-sim_regress:latest /bin/bash -c "./start_torque.sh; chown -R runner /home/runner/gpgpu-sim_distribution; su - runner -c 'source /home/runner/gpgpu-sim_distribution/setup_environment && make -j -C /home/runner/gpgpu-sim_distribution && cd /home/runner/gpgpu-sim_simulations/ && git pull && /home/runner/gpgpu-sim_simulations/util/job_launching/run_simulations.py -c /home/runner/gpgpu-sim_simulations/util/job_launching/regression_recipies/rodinia_2.0-ft/configs.gtx1080ti.yml -N regress && /home/runner/gpgpu-sim_simulations/util/job_launching/monitor_func_test.py -v -N regress'" - services: - - docker - \ No newline at end of file diff --git a/configs/tested-cfgs/SM2_GTX480/gpgpusim.config b/configs/tested-cfgs/SM2_GTX480/gpgpusim.config index bc01821db..47637286e 100644 --- a/configs/tested-cfgs/SM2_GTX480/gpgpusim.config +++ b/configs/tested-cfgs/SM2_GTX480/gpgpusim.config @@ -56,8 +56,8 @@ # In Fermi, the cache and shared memory can be configured to 16kb:48kb(default) or 48kb:16kb -# :::,::::,::,:** -# ** Optional parameter - Required when mshr_type==Texture Fifo +# :::,::::,::,:**, +# ** Optional parameter - Required when mshr_type==Texture Fifo, set to 0 if not used # Note: Hashing set index function (H) only applies to a set size of 32 or 64. -gpgpu_cache:dl1 N:32:128:4,L:L:m:N:H,S:64:8,8 -gpgpu_shmem_size 49152 diff --git a/configs/tested-cfgs/SM3_KEPLER_TITAN/gpgpusim.config b/configs/tested-cfgs/SM3_KEPLER_TITAN/gpgpusim.config index ef47ddfd9..9cb328f5d 100644 --- a/configs/tested-cfgs/SM3_KEPLER_TITAN/gpgpusim.config +++ b/configs/tested-cfgs/SM3_KEPLER_TITAN/gpgpusim.config @@ -99,8 +99,8 @@ # Greedy then oldest scheduler -gpgpu_scheduler gto -# :::,::::,::,:** -# ** Optional parameter - Required when mshr_type==Texture Fifo +# :::,::::,::,:**, +# ** Optional parameter - Required when mshr_type==Texture Fifo, set to 0 if not used # Note: Hashing set index function (H) only applies to a set size of 32 or 64. # The defulat is to disable the L1 cache, unless cache modifieres are used -gpgpu_cache:dl1 S:4:128:32,L:L:s:N:L,A:256:8,16:0,32 diff --git a/configs/tested-cfgs/SM6_TITANX/gpgpusim.config b/configs/tested-cfgs/SM6_TITANX/gpgpusim.config index 7d3e2d47e..882630e76 100644 --- a/configs/tested-cfgs/SM6_TITANX/gpgpusim.config +++ b/configs/tested-cfgs/SM6_TITANX/gpgpusim.config @@ -123,8 +123,8 @@ -gpgpu_dual_issue_diff_exec_units 1 ## L1/shared memory configuration -# :::,::::,::,:** -# ** Optional parameter - Required when mshr_type==Texture Fifo +# :::,::::,::,:**, +# ** Optional parameter - Required when mshr_type==Texture Fifo, set to 0 if not used # Note: Hashing set index function (H) only applies to a set size of 32 or 64. # The defulat is to disable the L1 cache, unless cache modifieres are used -gpgpu_l1_banks 2 diff --git a/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config b/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config index 6ff4b6c08..8cc3ed6bf 100644 --- a/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config +++ b/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config @@ -83,8 +83,8 @@ -gpgpu_dual_issue_diff_exec_units 1 ## L1/shared memory configuration -# :::,::::,::,:** -# ** Optional parameter - Required when mshr_type==Texture Fifo +# :::,::::,::,:**, +# ** Optional parameter - Required when mshr_type==Texture Fifo, set to 0 if not used # In adaptive cache, we adaptively assign the remaining shared memory to L1 cache # For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x -gpgpu_adaptive_cache_config 1 diff --git a/configs/tested-cfgs/SM75_RTX2060_S/gpgpusim.config b/configs/tested-cfgs/SM75_RTX2060_S/gpgpusim.config index 08ac75277..290c08d6c 100644 --- a/configs/tested-cfgs/SM75_RTX2060_S/gpgpusim.config +++ b/configs/tested-cfgs/SM75_RTX2060_S/gpgpusim.config @@ -128,8 +128,8 @@ -gpgpu_num_reg_banks 16 -gpgpu_reg_file_port_throughput 2 -# :::,::::,::,:** -# ** Optional parameter - Required when mshr_type==Texture Fifo +# :::,::::,::,:**, +# ** Optional parameter - Required when mshr_type==Texture Fifo, set to 0 if not used -gpgpu_adaptive_cache_config 0 -gpgpu_l1_banks 4 -gpgpu_cache:dl1 S:1:128:512,L:L:s:N:L,A:256:8,16:0,32 diff --git a/configs/tested-cfgs/SM7_GV100/gpgpusim.config b/configs/tested-cfgs/SM7_GV100/gpgpusim.config index 26ce0eb58..4887be8cf 100644 --- a/configs/tested-cfgs/SM7_GV100/gpgpusim.config +++ b/configs/tested-cfgs/SM7_GV100/gpgpusim.config @@ -137,8 +137,8 @@ -gpgpu_dual_issue_diff_exec_units 1 ## L1/shared memory configuration -# :::,::::,::,:** -# ** Optional parameter - Required when mshr_type==Texture Fifo +# :::,::::,::,:**, +# ** Optional parameter - Required when mshr_type==Texture Fifo, set to 0 if not used # Defualt config is 32KB DL1 and 96KB shared memory # In Volta, we assign the remaining shared memory to L1 cache # if the assigned shd mem = 0, then L1 cache = 128KB diff --git a/configs/tested-cfgs/SM7_QV100/gpgpusim.config b/configs/tested-cfgs/SM7_QV100/gpgpusim.config index b3384afcb..a14ae7567 100644 --- a/configs/tested-cfgs/SM7_QV100/gpgpusim.config +++ b/configs/tested-cfgs/SM7_QV100/gpgpusim.config @@ -137,8 +137,8 @@ -gpgpu_dual_issue_diff_exec_units 1 ## L1/shared memory configuration -# :::,::::,::,:** -# ** Optional parameter - Required when mshr_type==Texture Fifo +# :::,::::,::,:**, +# ** Optional parameter - Required when mshr_type==Texture Fifo, set to 0 if not used # Defualt config is 32KB DL1 and 96KB shared memory # In Volta, we assign the remaining shared memory to L1 cache # if the assigned shd mem = 0, then L1 cache = 128KB diff --git a/configs/tested-cfgs/SM7_TITANV/gpgpusim.config b/configs/tested-cfgs/SM7_TITANV/gpgpusim.config index c37aaf053..b48f37006 100644 --- a/configs/tested-cfgs/SM7_TITANV/gpgpusim.config +++ b/configs/tested-cfgs/SM7_TITANV/gpgpusim.config @@ -107,8 +107,8 @@ -gpgpu_dual_issue_diff_exec_units 1 ## L1/shared memory configuration -# :::,::::,::,:** -# ** Optional parameter - Required when mshr_type==Texture Fifo +# :::,::::,::,:**, +# ** Optional parameter - Required when mshr_type==Texture Fifo, set to 0 if not used # Defualt config is 32KB DL1 and 96KB shared memory # In Volta, we assign the remaining shared memory to L1 cache # if the assigned shd mem = 0, then L1 cache = 128KB diff --git a/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config b/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config index d26b1a621..c884541bb 100644 --- a/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config +++ b/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config @@ -83,8 +83,8 @@ -gpgpu_dual_issue_diff_exec_units 1 ## L1/shared memory configuration -# :::,::::,::,:** -# ** Optional parameter - Required when mshr_type==Texture Fifo +# :::,::::,::,:**, +# ** Optional parameter - Required when mshr_type==Texture Fifo, set to 0 if not used # In adaptive cache, we adaptively assign the remaining shared memory to L1 cache # For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x -gpgpu_adaptive_cache_config 1 diff --git a/format-code.sh b/format-code.sh index acd33ab1c..b83510d20 100755 --- a/format-code.sh +++ b/format-code.sh @@ -1,13 +1,13 @@ # This bash script formats GPGPU-Sim using clang-format THIS_DIR="$( cd "$( dirname "$BASH_SOURCE" )" && pwd )" echo "Running clang-format on $THIS_DIR" -clang-format -i ${THIS_DIR}/libcuda/*.h -clang-format -i ${THIS_DIR}/libcuda/*.cc -clang-format -i ${THIS_DIR}/src/*.h -clang-format -i ${THIS_DIR}/src/*.cc -clang-format -i ${THIS_DIR}/src/gpgpu-sim/*.h -clang-format -i ${THIS_DIR}/src/gpgpu-sim/*.cc -clang-format -i ${THIS_DIR}/src/cuda-sim/*.h -clang-format -i ${THIS_DIR}/src/cuda-sim/*.cc -clang-format -i ${THIS_DIR}/src/accelwattch/*.h -clang-format -i ${THIS_DIR}/src/accelwattch/*.cc \ No newline at end of file +clang-format -i ${THIS_DIR}/libcuda/*.h --style=file:${THIS_DIR}/.clang-format +clang-format -i ${THIS_DIR}/libcuda/*.cc --style=file:${THIS_DIR}/.clang-format +clang-format -i ${THIS_DIR}/src/*.h --style=file:${THIS_DIR}/.clang-format +clang-format -i ${THIS_DIR}/src/*.cc --style=file:${THIS_DIR}/.clang-format +clang-format -i ${THIS_DIR}/src/gpgpu-sim/*.h --style=file:${THIS_DIR}/.clang-format +clang-format -i ${THIS_DIR}/src/gpgpu-sim/*.cc --style=file:${THIS_DIR}/.clang-format +clang-format -i ${THIS_DIR}/src/cuda-sim/*.h --style=file:${THIS_DIR}/.clang-format +clang-format -i ${THIS_DIR}/src/cuda-sim/*.cc --style=file:${THIS_DIR}/.clang-format +clang-format -i ${THIS_DIR}/src/accelwattch/*.h --style=file:${THIS_DIR}/.clang-format +clang-format -i ${THIS_DIR}/src/accelwattch/*.cc --style=file:${THIS_DIR}/.clang-format diff --git a/gpgpusim_check.cmake b/gpgpusim_check.cmake index 486d66dc9..7adf34286 100644 --- a/gpgpusim_check.cmake +++ b/gpgpusim_check.cmake @@ -63,8 +63,8 @@ else() message(CHECK_PASS "${CUDAToolkit_NVCC_EXECUTABLE}") message(CHECK_START "Checking CUDA compiler version") message(CHECK_PASS "${CUDAToolkit_VERSION}") - if((CUDAToolkit_VERSION VERSION_LESS 2.0.3) OR (CUDAToolkit_VERSION VERSION_GREATER 11.10.0)) - message(FATAL_ERROR "GPGPU-Sim ${CMAKE_PROJECT_VERSION} not tested with CUDA version ${CUDAToolkit_VERSION} (please see README)") + if((CUDAToolkit_VERSION VERSION_LESS 2.0.3) OR (CUDAToolkit_VERSION VERSION_GREATER 13.0.0)) + message(WARNING "GPGPU-Sim not tested with CUDA version ${CUDAToolkit_VERSION} (please see README)") endif() endif() @@ -132,4 +132,4 @@ list(POP_BACK CMAKE_MESSAGE_INDENT) message(CHECK_PASS "done") message(STATUS "Be sure to run 'source setup' " "before you run CUDA program with GPGPU-Sim or building with external " - "simulator like SST") \ No newline at end of file + "simulator like SST") diff --git a/libcuda/cuda_api.h b/libcuda/cuda_api.h index 5a970ba01..52f36ebdb 100644 --- a/libcuda/cuda_api.h +++ b/libcuda/cuda_api.h @@ -2607,12 +2607,12 @@ typedef struct CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st { /** * Device that represents the CPU */ -#define CU_DEVICE_CPU ((CUdevice)-1) +#define CU_DEVICE_CPU ((CUdevice) - 1) /** * Device that represents an invalid device */ -#define CU_DEVICE_INVALID ((CUdevice)-2) +#define CU_DEVICE_INVALID ((CUdevice) - 2) /** @} */ /* END CUDA_TYPES */ diff --git a/libcuda/cuda_api_object.h b/libcuda/cuda_api_object.h index e620e5728..3e34641ed 100644 --- a/libcuda/cuda_api_object.h +++ b/libcuda/cuda_api_object.h @@ -35,9 +35,7 @@ struct _cuda_device_id { m_next = NULL; m_gpgpu = gpu; } - struct _cuda_device_id *next() { - return m_next; - } + struct _cuda_device_id *next() { return m_next; } unsigned num_shader() const { return m_gpgpu->get_config().num_shader(); } int num_devices() const { if (m_next == NULL) @@ -158,9 +156,7 @@ class kernel_config { void set_grid_dim(dim3 *d) { m_GridDim = *d; } void set_block_dim(dim3 *d) { m_BlockDim = *d; } gpgpu_ptx_sim_arg_list_t get_args() { return m_args; } - struct CUstream_st *get_stream() { - return m_stream; - } + struct CUstream_st *get_stream() { return m_stream; } private: dim3 m_GridDim; diff --git a/libcuda/cuda_runtime_api.cc b/libcuda/cuda_runtime_api.cc index 3574fc17d..8de12a335 100644 --- a/libcuda/cuda_runtime_api.cc +++ b/libcuda/cuda_runtime_api.cc @@ -1809,6 +1809,8 @@ cudaDeviceGetAttributeInternal(int *value, enum cudaDeviceAttr attr, int device, case 19: *value = 0; break; + case 20: // cudaDevAttrComputeMode for controlling cudaSetDevice for threads + *value = 0; // Dummy value, should not affect simulation case 21: case 22: case 23: @@ -2120,8 +2122,10 @@ __host__ cudaError_t CUDARTAPI cudaStreamSynchronizeInternal( announce_call(__my_func__); } #if (CUDART_VERSION >= 3000) - if (stream == NULL) ctx->synchronize(); - return g_last_cudaError = cudaSuccess; + if (stream == NULL) { + ctx->synchronize(); + return g_last_cudaError = cudaSuccess; + } stream->synchronize(); #else printf( @@ -2427,6 +2431,18 @@ void SST_gpgpusim_numcores_equal_check(unsigned sst_numcores) { ->SST_gpgpusim_numcores_equal_check(sst_numcores); } +/** + * @brief For SST to check if kernel launch is blocking + * Future: we will need a better interface to the + * GPGPU-Sim config for integration with outside + * simulators. + * + */ +bool SST_gpgpusim_launch_blocking() { + return GPGPU_Context()->the_gpgpusim->g_stream_manager->is_blocking(); + +} + uint64_t cudaMallocSST(void **devPtr, size_t size) { if (g_debug_execution >= 3) { announce_call(__my_func__); @@ -2977,6 +2993,40 @@ __host__ cudaError_t CUDARTAPI cudaStreamSynchronize(cudaStream_t stream) { return cudaStreamSynchronizeInternal(stream); } +__host__ cudaError_t CUDARTAPI cudaStreamSynchronizeSST(cudaStream_t stream) { + // For SST, perform a one-time check + gpgpu_context *ctx = GPGPU_Context(); + if (g_debug_execution >= 3) { + announce_call(__my_func__); + } + + // default stream: all is done + // other streams: no more ops + g_last_cudaError = cudaSuccess; + if (stream == NULL) { + // For default stream, sync is equivalent to cudaThreadSync + bool thread_synced = ctx->synchronize_check(); + if (thread_synced) { + // We are already done, so no need to poll for sync done + return cudaSuccess; + } else { + // Otherwise we mark we should wait for default strem to sync + ctx->the_gpgpusim->g_stream_manager->get_stream_zero()->set_request_synchronize(); + return cudaErrorNotReady; + } + } else { + // For other stream, check if it is already sync'ed + bool stream_synced = stream->synchronize_check(); + if (stream_synced) { + return cudaSuccess; + } else { + stream->set_request_synchronize(); + return cudaErrorNotReady; + } + } + return g_last_cudaError = cudaSuccess; +} + __host__ cudaError_t CUDARTAPI cudaStreamQuery(cudaStream_t stream) { if (g_debug_execution >= 3) { announce_call(__my_func__); @@ -3046,13 +3096,34 @@ __host__ cudaError_t CUDARTAPI cudaEventSynchronize(cudaEvent_t event) { printf("GPGPU-Sim API: cudaEventSynchronize ** waiting for event\n"); fflush(stdout); CUevent_st *e = (CUevent_st *)event; - while (!e->done()) - ; + while (!e->done()); printf("GPGPU-Sim API: cudaEventSynchronize ** event detected\n"); fflush(stdout); return g_last_cudaError = cudaSuccess; } +__host__ cudaError_t CUDARTAPI cudaEventSynchronizeSST(cudaEvent_t event) { + // For SST, perform a one-time check + // and let stream manager send the callback once the event is done + if (g_debug_execution >= 3) { + announce_call(__my_func__); + } + printf("GPGPU-Sim API: cudaEventSynchronize ** waiting for event\n"); + fflush(stdout); + CUevent_st *e = (CUevent_st *)event; + bool event_sync_done = e->done(); + if (event_sync_done) { + printf("GPGPU-Sim API: cudaEventSynchronize ** event detected\n"); + fflush(stdout); + return cudaSuccess; + } else { + printf("GPGPU-Sim API: cudaEventSynchronize ** still waiting for event\n"); + // Mark this event as waiting for synchronization + e->set_request_synchronize(); + return cudaErrorNotReady; + } +} + __host__ cudaError_t CUDARTAPI cudaEventDestroy(cudaEvent_t event) { if (g_debug_execution >= 3) { announce_call(__my_func__); @@ -3112,6 +3183,7 @@ __host__ cudaError_t CUDARTAPI cudaThreadSynchronizeSST(void) { ctx->requested_synchronize = false; return cudaSuccess; } else { + ctx->requested_synchronize = true; return cudaErrorNotReady; } } @@ -3171,7 +3243,7 @@ __host__ cudaError_t CUDARTAPI cudaGetExportTable( * * *******************************************************************************/ -//#include "../../cuobjdump_to_ptxplus/cuobjdump_parser.h" +// #include "../../cuobjdump_to_ptxplus/cuobjdump_parser.h" // extracts all ptx files from binary and dumps into // prog_name.unique_no.sm_<>.ptx files @@ -4021,6 +4093,18 @@ cudaError_t CUDARTAPI cudaSetDeviceFlags(int flags) { } } +cudaError_t CUDARTAPI cudaSetDeviceFlagsSST(int flags) { + if (g_debug_execution >= 3) { + announce_call(__my_func__); + } + // SST's simple stream example relies on this + // currently just set it to no-op + printf( + "GPGPU-Sim PTX: Execution warning: ignoring call to \"%s ( flag=%p)\"\n", + __my_func__, flags); + return cudaSuccess; +} + cudaError_t CUDARTAPI cudaFuncGetAttributes(struct cudaFuncAttributes *attr, const char *hostFun) { return cudaFuncGetAttributesInternal(attr, hostFun); @@ -4068,9 +4152,9 @@ __host__ cudaError_t CUDARTAPI cudaDeviceSetLimit(enum cudaLimit limit, return g_last_cudaError = cudaSuccess; } -//#if CUDART_VERSION >= 9000 +// #if CUDART_VERSION >= 9000 //__host__ cudaError_t cudaFuncSetAttribute ( const void* func, enum -// cudaFuncAttribute attr, int value ) { +// cudaFuncAttribute attr, int value ) { // ignore this Attribute for now, and the default is that carveout = // cudaSharedmemCarveoutDefault; // (-1) diff --git a/linux-so-version.txt b/linux-so-version.txt index 3db07aca6..55d372748 100644 --- a/linux-so-version.txt +++ b/linux-so-version.txt @@ -10,5 +10,7 @@ libcudart.so.10.1{ }; libcudart.so.11.0{ }; +libcudart.so.12{ +}; libcuda.so.1{ }; diff --git a/new b/new deleted file mode 100644 index 064543fd2..000000000 --- a/new +++ /dev/null @@ -1,7 +0,0 @@ -sudo docker run --dns 206.87.227.99 --privileged -v `pwd`:/home/runner/gpgpu-sim_distribution:rw tgrogers/gpgpu-sim_regress:volta_update /bin/bash -c "./start_torque.sh; chown -R runner /home/runner/gpgpu-sim_distribution; su - runner -c 'export CUDA_INSTALL_PATH=/usr/local/cuda-4.2/ && export PTXAS_CUDA_INSTALL_PATH=/usr/local/cuda-4.2/ && make clean -C /home/runner/gpgpu-sim_distribution && source /home/runner/gpgpu-sim_distribution/setup_environment && make -j -C /home/runner/gpgpu-sim_distribution && cd /home/runner/gpgpu-sim_simulations/ && git pull && /home/runner/gpgpu-sim_simulations/util/job_launching/run_simulations.py -C GTX480 -B rodinia_2.0-ft -N regress && /home/runner/gpgpu-sim_simulations/util/job_launching/monitor_func_test.py -v -N regress'" -# - services: docker -# env: -# - CONFIG=GTX480 -# - CUDA_INSTALL_PATH=/usr/local/cuda-4.2/ -# - PTXAS_CUDA_INSTALL_PATH=/usr/local/cuda-4.2/ - diff --git a/src/abstract_hardware_model.cc b/src/abstract_hardware_model.cc index e8ddf95ab..8743cc7a7 100644 --- a/src/abstract_hardware_model.cc +++ b/src/abstract_hardware_model.cc @@ -216,20 +216,20 @@ new_addr_type line_size_based_tag_func(new_addr_type address, return address & ~(line_size - 1); } -const char *mem_access_type_str(enum mem_access_type access_type) { +const char *mem_access_type_str(enum mem_access_type access_type){ #define MA_TUP_BEGIN(X) static const char *access_type_str[] = { #define MA_TUP(X) #X #define MA_TUP_END(X) \ } \ ; - MEM_ACCESS_TYPE_TUP_DEF + MEM_ACCESS_TYPE_TUP_DEF #undef MA_TUP_BEGIN #undef MA_TUP #undef MA_TUP_END - assert(access_type < NUM_MEM_ACCESS_TYPE); + assert(access_type < NUM_MEM_ACCESS_TYPE); - return access_type_str[access_type]; +return access_type_str[access_type]; } void warp_inst_t::clear_active(const active_mask_t &inactive) { diff --git a/src/abstract_hardware_model.h b/src/abstract_hardware_model.h index 98a403997..cddf523e7 100644 --- a/src/abstract_hardware_model.h +++ b/src/abstract_hardware_model.h @@ -248,9 +248,7 @@ class kernel_info_t { } bool running() const { return m_num_cores_running > 0; } bool done() const { return no_more_ctas_to_run() && !running(); } - class function_info *entry() { - return m_kernel_entry; - } + class function_info *entry() { return m_kernel_entry; } const class function_info *entry() const { return m_kernel_entry; } size_t num_blocks() const { @@ -300,9 +298,7 @@ class kernel_info_t { std::list &active_threads() { return m_active_threads; } - class memory_space *get_param_memory() { - return m_param_mem; - } + class memory_space *get_param_memory() { return m_param_mem; } // The following functions access texture bindings present at the kernel's // launch @@ -609,15 +605,9 @@ class gpgpu_t { void memcpy_from_gpu(void *dst, size_t src_start_addr, size_t count); void memcpy_gpu_to_gpu(size_t dst, size_t src, size_t count); - class memory_space *get_global_memory() { - return m_global_mem; - } - class memory_space *get_tex_memory() { - return m_tex_mem; - } - class memory_space *get_surf_memory() { - return m_surf_mem; - } + class memory_space *get_global_memory() { return m_global_mem; } + class memory_space *get_tex_memory() { return m_tex_mem; } + class memory_space *get_surf_memory() { return m_surf_mem; } void gpgpu_ptx_sim_bindTextureToArray(const struct textureReference *texref, const struct cudaArray *array); @@ -701,6 +691,7 @@ struct gpgpu_ptx_sim_info { int cmem; int gmem; int regs; + int barriers; unsigned maxthreads; unsigned ptx_version; unsigned sm_target; @@ -1336,9 +1327,7 @@ class core_t { virtual bool warp_waiting_at_barrier(unsigned warp_id) const = 0; virtual void checkExecutionStatusAndUpdate(warp_inst_t &inst, unsigned t, unsigned tid) = 0; - class gpgpu_sim *get_gpu() { - return m_gpu; - } + class gpgpu_sim *get_gpu() { return m_gpu; } void execute_warp_inst_t(warp_inst_t &inst, unsigned warpId = (unsigned)-1); bool ptx_thread_done(unsigned hw_thread_id) const; virtual void updateSIMTStack(unsigned warpId, warp_inst_t *inst); @@ -1348,9 +1337,7 @@ class core_t { void get_pdom_stack_top_info(unsigned warpId, unsigned *pc, unsigned *rpc) const; kernel_info_t *get_kernel_info() { return m_kernel; } - class ptx_thread_info **get_thread_info() { - return m_thread; - } + class ptx_thread_info **get_thread_info() { return m_thread; } unsigned get_warp_size() const { return m_warp_size; } void and_reduction(unsigned ctaid, unsigned barid, bool value) { reduction_storage[ctaid][barid] &= value; diff --git a/src/accelwattch/XML_Parse.h b/src/accelwattch/XML_Parse.h index 176b82f6e..7017c9002 100644 --- a/src/accelwattch/XML_Parse.h +++ b/src/accelwattch/XML_Parse.h @@ -40,9 +40,9 @@ #ifndef XML_PARSE_H_ #define XML_PARSE_H_ -//#ifdef WIN32 -//#define _CRT_SECURE_NO_DEPRECATE -//#endif +// #ifdef WIN32 +// #define _CRT_SECURE_NO_DEPRECATE +// #endif #include #include diff --git a/src/accelwattch/core.cc b/src/accelwattch/core.cc index cbaefc7ff..fe11e058e 100644 --- a/src/accelwattch/core.cc +++ b/src/accelwattch/core.cc @@ -47,38 +47,39 @@ #include "const.h" #include "io.h" #include "parameter.h" -//#include "globalvar.h" -// double exClockRate; +// #include "globalvar.h" +// double exClockRate; //********************* -// Operand collector (OC) modelling (Syed Gilani) +// Operand collector (OC) modelling (Syed Gilani) //********************* -// The OCs are modelled similar to the GPGPU-Sim v3.x documentation and -// nVIDIA patents. -// the OC need the following GPGPU-Sim config options: +// The OCs are modelled similar to the GPGPU-Sim v3.x documentation and +// nVIDIA patents. +// the OC need the following GPGPU-Sim config options: //-gpgpu_num_reg_banks 8 # Number of register banks (default //= 8) -gpgpu_reg_bank_use_warp_id 0 # Use warp ID in mapping -// registers to banks (default = off) -gpgpu_operand_collector_num_units_sp 6 # -// number of collector units (default = 4) -// -gpgpu_operand_collector_num_units_sfu 8 # number of collector units (default -// = 4) -gpgpu_operand_collector_num_units_mem 2 # number of -// collector units (default = 2) -gpgpu_operand_collector_num_units_gen 0 # -// number of collector units (default = 0) +// registers to banks (default = off) -gpgpu_operand_collector_num_units_sp 6 # +// number of collector units (default = 4) +// -gpgpu_operand_collector_num_units_sfu 8 # number of collector units +// (default = 4) -gpgpu_operand_collector_num_units_mem 2 # +// number of collector units (default = 2) +// -gpgpu_operand_collector_num_units_gen 0 # number of collector units +// (default = 0) //-gpgpu_operand_collector_num_in_ports_sp 1 # number of -// collector unit in ports (default = 1) +// collector unit in ports (default = 1) //-gpgpu_operand_collector_num_in_ports_sfu 1 # number of -// collector unit in ports (default = 1) +// collector unit in ports (default = 1) //-gpgpu_operand_collector_num_in_ports_mem 1 # number of -// collector unit in ports (default = 1) +// collector unit in ports (default = 1) //-gpgpu_operand_collector_num_in_ports_gen 0 # number of -// collector unit in ports (default = 0) +// collector unit in ports (default = 0) //-gpgpu_operand_collector_num_out_ports_sp 1 # number of -// collector unit in ports (default = 1) +// collector unit in ports (default = 1) //-gpgpu_operand_collector_num_out_ports_sfu 1 # number of -// collector unit in ports (default = 1) +// collector unit in ports (default = 1) //-gpgpu_operand_collector_num_out_ports_mem 1 # number of -// collector unit in ports (default = 1) +// collector unit in ports (default = 1) //-gpgpu_operand_collector_num_out_ports_gen 0 # number of -// collector unit in ports (default = 0) +// collector unit in ports (default = 0) // The total number of collector units and their input ports, and the number of // register file banks determine the crossbar size. @@ -1837,7 +1838,7 @@ MemManU::MemManU(ParseXML* XML_interface, int ithCore_, area.set_area(area.get_area() + dtlb->local_result.area); // output_data_csv(dtlb.tlb.local_result); } -//#define FERMI +// #define FERMI RegFU::RegFU(ParseXML* XML_interface, int ithCore_, InputParameter* interface_ip_, const CoreDynParam& dyn_p_, @@ -2194,7 +2195,7 @@ EXECU::EXECU(ParseXML* XML_interface, int ithCore_, bypass.area.set_area(bypass.area.get_area() +fpTagBypass->area.get_area()); }*/ - } /* if (coredynp.core_ty==Inorder) */ + } /* if (coredynp.core_ty==Inorder) */ else { // OOO if (coredynp.scheu_ty == PhysicalRegFile) { /* For physical register based OOO, diff --git a/src/accelwattch/iocontrollers.h b/src/accelwattch/iocontrollers.h index 9d6c48a22..22df1fb0b 100644 --- a/src/accelwattch/iocontrollers.h +++ b/src/accelwattch/iocontrollers.h @@ -35,9 +35,9 @@ #include "XML_Parse.h" #include "cacti/parameter.h" -//#include "io.h" +// #include "io.h" #include "array.h" -//#include "Undifferentiated_Core_Area.h" +// #include "Undifferentiated_Core_Area.h" #include #include "basic_components.h" diff --git a/src/accelwattch/logic.cc b/src/accelwattch/logic.cc index 7f401895c..3d3a1a463 100644 --- a/src/accelwattch/logic.cc +++ b/src/accelwattch/logic.cc @@ -656,7 +656,7 @@ FunctionalUnit::FunctionalUnit(ParseXML *XML_interface, int ithCore_, } per_access_energy *= 0.5; // According to ARM data embedded processor has // much lower per acc energy - } /* if (XML->sys.Embedded) */ + } /* if (XML->sys.Embedded) */ else { if (fu_type == FPU) { num_fu = coredynp.num_fpus; diff --git a/src/accelwattch/memoryctrl.h b/src/accelwattch/memoryctrl.h index 4ac55fc73..f065c0aeb 100644 --- a/src/accelwattch/memoryctrl.h +++ b/src/accelwattch/memoryctrl.h @@ -41,9 +41,9 @@ #include "XML_Parse.h" #include "cacti/parameter.h" -//#include "io.h" +// #include "io.h" #include "array.h" -//#include "Undifferentiated_Core_Area.h" +// #include "Undifferentiated_Core_Area.h" #include #include "basic_components.h" diff --git a/src/accelwattch/processor.cc b/src/accelwattch/processor.cc index d5c7cdda8..a86b96dc9 100644 --- a/src/accelwattch/processor.cc +++ b/src/accelwattch/processor.cc @@ -664,24 +664,22 @@ void Processor::displayDeviceType(int device_type_, uint32_t indent) { switch (device_type_) { case 0: - cout << indent_str << "Device Type= " - << "ITRS high performance device type" << endl; + cout << indent_str + << "Device Type= " << "ITRS high performance device type" << endl; break; case 1: - cout << indent_str << "Device Type= " - << "ITRS low standby power device type" << endl; + cout << indent_str + << "Device Type= " << "ITRS low standby power device type" << endl; break; case 2: - cout << indent_str << "Device Type= " - << "ITRS low operating power device type" << endl; + cout << indent_str + << "Device Type= " << "ITRS low operating power device type" << endl; break; case 3: - cout << indent_str << "Device Type= " - << "LP-DRAM device type" << endl; + cout << indent_str << "Device Type= " << "LP-DRAM device type" << endl; break; case 4: - cout << indent_str << "Device Type= " - << "COMM-DRAM device type" << endl; + cout << indent_str << "Device Type= " << "COMM-DRAM device type" << endl; break; default: { cout << indent_str << "Unknown Device Type" << endl; diff --git a/src/accelwattch/xmlParser.cc b/src/accelwattch/xmlParser.cc index 780d2ad04..9f01ebe7e 100644 --- a/src/accelwattch/xmlParser.cc +++ b/src/accelwattch/xmlParser.cc @@ -85,10 +85,10 @@ #endif #include "xmlParser.h" #ifdef _XMLWINDOWS -//#ifdef _DEBUG -//#define _CRTDBG_MAP_ALLOC -//#include -//#endif +// #ifdef _DEBUG +// #define _CRTDBG_MAP_ALLOC +// #include +// #endif #define WIN32_LEAN_AND_MEAN #include // to have IsTextUnicode, MultiByteToWideChar, WideCharToMultiByte to handle unicode files // to have "MessageBoxA" to display error messages for openFilHelper @@ -3241,7 +3241,7 @@ XMLSTR XMLParserBase64Tool::encode(unsigned char *inbuf, unsigned int inlen, *(curr++) = base64EncodeTable[j >> 18]; *(curr++) = base64EncodeTable[(j >> 12) & 0x3f]; *(curr++) = base64EncodeTable[(j >> 6) & 0x3f]; - *(curr++) = base64EncodeTable[(j)&0x3f]; + *(curr++) = base64EncodeTable[(j) & 0x3f]; if (formatted) { if (!k) { *(curr++) = _CXML('\n'); diff --git a/src/accelwattch/xmlParser.h b/src/accelwattch/xmlParser.h index 71a1f5722..b5c077972 100644 --- a/src/accelwattch/xmlParser.h +++ b/src/accelwattch/xmlParser.h @@ -163,7 +163,7 @@ // uncomment the next line if you want no support for wchar_t* (no need for the // or libraries anymore to compile) -//#define XML_NO_WIDE_CHAR +// #define XML_NO_WIDE_CHAR #ifdef XML_NO_WIDE_CHAR #undef _XMLWINDOWS diff --git a/src/cuda-sim/cuda-sim.cc b/src/cuda-sim/cuda-sim.cc index d05549cc3..69d1eb74f 100644 --- a/src/cuda-sim/cuda-sim.cc +++ b/src/cuda-sim/cuda-sim.cc @@ -2782,9 +2782,7 @@ void print_ptxinfo() { } } -struct gpgpu_ptx_sim_info get_ptxinfo() { - return g_ptxinfo; -} +struct gpgpu_ptx_sim_info get_ptxinfo() { return g_ptxinfo; } std::map get_duplicate() { return g_duplicate; } @@ -2801,6 +2799,8 @@ void ptxinfo_function(const char *fname) { void ptxinfo_regs(unsigned nregs) { g_ptxinfo.regs = nregs; } +void ptxinfo_barriers(unsigned barriers) { g_ptxinfo.barriers = barriers; } + void ptxinfo_lmem(unsigned declared, unsigned system) { g_ptxinfo.lmem = declared + system; } diff --git a/src/cuda-sim/cuda-sim.h b/src/cuda-sim/cuda-sim.h index 21e1ca058..b1caf0c64 100644 --- a/src/cuda-sim/cuda-sim.h +++ b/src/cuda-sim/cuda-sim.h @@ -101,8 +101,8 @@ class functionalCoreSim : public core_t { bool *m_warpAtBarrier; }; -#define RECONVERGE_RETURN_PC ((address_type)-2) -#define NO_BRANCH_DIVERGENCE ((address_type)-1) +#define RECONVERGE_RETURN_PC ((address_type) - 2) +#define NO_BRANCH_DIVERGENCE ((address_type) - 1) address_type get_return_pc(void *thd); const char *get_ptxinfo_kname(); void print_ptxinfo(); diff --git a/src/cuda-sim/half.h b/src/cuda-sim/half.h index fab1a229b..67e607b5d 100644 --- a/src/cuda-sim/half.h +++ b/src/cuda-sim/half.h @@ -846,10 +846,8 @@ uint16 int2half_impl(T value) { bits |= 0x7BFF + (R != std::round_toward_zero); } else if (value) { unsigned int m = value, exp = 24; - for (; m < 0x400; m <<= 1, --exp) - ; - for (; m > 0x7FF; m >>= 1, ++exp) - ; + for (; m < 0x400; m <<= 1, --exp); + for (; m > 0x7FF; m >>= 1, ++exp); bits |= (exp << 10) + m; if (exp > 24) { if (R == std::round_to_nearest) @@ -1274,8 +1272,7 @@ inline double half2float_impl(uint16 value, double, true_type) { int abs = value & 0x7FFF; if (abs) { hi |= 0x3F000000 << static_cast(abs >= 0x7C00); - for (; abs < 0x400; abs <<= 1, hi -= 0x100000) - ; + for (; abs < 0x400; abs <<= 1, hi -= 0x100000); hi += static_cast(abs) << 10; } uint64 bits = static_cast(hi) << 32; @@ -2116,8 +2113,7 @@ struct functions { static half frexp(half arg, int *exp) { int m = arg.data_ & 0x7FFF, e = -14; if (m >= 0x7C00 || !m) return *exp = 0, arg; - for (; m < 0x400; m <<= 1, --e) - ; + for (; m < 0x400; m <<= 1, --e); return *exp = e + (m >> 10), half(binary, (arg.data_ & 0x8000) | 0x3800 | (m & 0x3FF)); } @@ -2135,8 +2131,7 @@ struct functions { unsigned int mask = (1 << (25 - e)) - 1, m = arg.data_ & mask; iptr->data_ = arg.data_ & ~mask; if (!m) return half(binary, arg.data_ & 0x8000); - for (; m < 0x400; m <<= 1, --e) - ; + for (; m < 0x400; m <<= 1, --e); return half(binary, static_cast((arg.data_ & 0x8000) | (e << 10) | (m & 0x3FF))); } @@ -2148,8 +2143,7 @@ struct functions { static half scalbln(half arg, long exp) { unsigned int m = arg.data_ & 0x7FFF; if (m >= 0x7C00 || !m) return arg; - for (; m < 0x400; m <<= 1, --exp) - ; + for (; m < 0x400; m <<= 1, --exp); exp += m >> 10; uint16 value = arg.data_ & 0x8000; if (exp > 30) { @@ -2191,8 +2185,7 @@ struct functions { if (abs < 0x7C00) { int exp = (abs >> 10) - 15; if (abs < 0x400) - for (; abs < 0x200; abs <<= 1, --exp) - ; + for (; abs < 0x200; abs <<= 1, --exp); return exp; } if (abs > 0x7C00) return FP_ILOGBNAN; @@ -2208,13 +2201,11 @@ struct functions { if (abs < 0x7C00) { int exp = (abs >> 10) - 15; if (abs < 0x400) - for (; abs < 0x200; abs <<= 1, --exp) - ; + for (; abs < 0x200; abs <<= 1, --exp); uint16 bits = (exp < 0) << 15; if (exp) { unsigned int m = std::abs(exp) << 6, e = 18; - for (; m < 0x400; m <<= 1, --e) - ; + for (; m < 0x400; m <<= 1, --e); bits |= (e << 10) + m; } return half(binary, bits); diff --git a/src/cuda-sim/ptx_ir.h b/src/cuda-sim/ptx_ir.h index b08a692d8..4a7d39b66 100644 --- a/src/cuda-sim/ptx_ir.h +++ b/src/cuda-sim/ptx_ir.h @@ -39,7 +39,7 @@ #include #include -//#include "ptx.tab.h" +// #include "ptx.tab.h" #include "ptx_sim.h" #include "memory.h" diff --git a/src/cuda-sim/ptx_sim.h b/src/cuda-sim/ptx_sim.h index 8eec922e4..7128f8e4e 100644 --- a/src/cuda-sim/ptx_sim.h +++ b/src/cuda-sim/ptx_sim.h @@ -340,9 +340,7 @@ class ptx_thread_info { dim3 get_ctaid() const { return m_ctaid; } dim3 get_tid() const { return m_tid; } dim3 get_ntid() const { return m_ntid; } - class gpgpu_sim *get_gpu() { - return (gpgpu_sim *)m_gpu; - } + class gpgpu_sim *get_gpu() { return (gpgpu_sim *)m_gpu; } unsigned get_hw_tid() const { return m_hw_tid; } unsigned get_hw_ctaid() const { return m_hw_ctaid; } unsigned get_hw_wid() const { return m_hw_wid; } diff --git a/src/cuda-sim/ptxinfo.l b/src/cuda-sim/ptxinfo.l index 51371e321..db81bca64 100644 --- a/src/cuda-sim/ptxinfo.l +++ b/src/cuda-sim/ptxinfo.l @@ -58,6 +58,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "Compiling entry function" TC; return FUNC; "Used" TC; return USED; "registers" TC; return REGS; +"used" TC; return USED; +"barriers" TC; return REGS; "bytes" TC; return BYTES; "lmem" TC; return LMEM; "smem" TC; return SMEM; @@ -65,11 +67,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "gmem" TC; return GMEM; "line" TC; return LINE; "for" TC; return FOR; +"ms" TC; return MS; "textures" TC; return TEXTURES; "error : Duplicate definition of" TC; return DUPLICATE; "function" TC; yylval->string_value = strdup(yytext); return FUNCTION; "variable" TC; yylval->string_value = strdup(yytext); return VARIABLE; "fatal : Ptx assembly aborted due to errors" TC; return FATAL; +"Compile time = " TC; return COMPILETIME; [_A-Za-z$%][_0-9A-Za-z$]* TC; yylval->string_value = strdup(yytext); return IDENTIFIER; [-]{0,1}[0-9]+ TC; yylval->int_value = atoi(yytext); return INT_OPERAND; @@ -79,6 +83,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "[" TC; return LEFT_SQUARE_BRACKET; "]" TC; return RIGHT_SQUARE_BRACKET; ":" TC; return COLON; +"." TC; return PERIOD; ";" TC; return SEMICOLON; "'" TC; return QUOTE; " " TC; diff --git a/src/cuda-sim/ptxinfo.y b/src/cuda-sim/ptxinfo.y index b30395834..722520761 100644 --- a/src/cuda-sim/ptxinfo.y +++ b/src/cuda-sim/ptxinfo.y @@ -49,6 +49,8 @@ typedef void * yyscan_t; %token FUNC %token USED %token REGS +%token BARRIERS +%token COMPILETIME %token BYTES %token LMEM %token SMEM @@ -70,6 +72,8 @@ typedef void * yyscan_t; %token FUNCTION %token VARIABLE %token FATAL +%token PERIOD +%token MS %{ #include @@ -81,6 +85,7 @@ typedef void * yyscan_t; void yyerror(yyscan_t yyscanner, ptxinfo_data* ptxinfo, const char* msg); void ptxinfo_function(const char *fname ); void ptxinfo_regs( unsigned nregs ); + void ptxinfo_barriers( unsigned barriers ); void ptxinfo_lmem( unsigned declared, unsigned system ); void ptxinfo_gmem( unsigned declared, unsigned system ); void ptxinfo_smem( unsigned declared, unsigned system ); @@ -126,8 +131,10 @@ info: USED INT_OPERAND REGS { ptxinfo_regs($2); } | INT_OPERAND BYTES LMEM { ptxinfo_lmem($1,0); } | INT_OPERAND BYTES SMEM { ptxinfo_smem($1,0); } | INT_OPERAND BYTES CMEM { ptxinfo_cmem($1,0); } + | USED INT_OPERAND BARRIERS { ptxinfo_barriers($2); } | INT_OPERAND REGS { ptxinfo_regs($1); } | INT_OPERAND TEXTURES {} + | COMPILETIME INT_OPERAND PERIOD INT_OPERAND MS {} ; tuple: INT_OPERAND PLUS INT_OPERAND BYTES { g_declared=$1; g_system=$3; } diff --git a/src/gpgpu-sim/dram.cc b/src/gpgpu-sim/dram.cc index 80e20d795..1e1ad3d02 100644 --- a/src/gpgpu-sim/dram.cc +++ b/src/gpgpu-sim/dram.cc @@ -684,13 +684,9 @@ bool dram_t::issue_row_command(int j) { } // if mrq is being serviced by dram, gets popped after CL latency fulfilled -class mem_fetch *dram_t::return_queue_pop() { - return returnq->pop(); -} +class mem_fetch *dram_t::return_queue_pop() { return returnq->pop(); } -class mem_fetch *dram_t::return_queue_top() { - return returnq->top(); -} +class mem_fetch *dram_t::return_queue_top() { return returnq->top(); } void dram_t::print(FILE *simFile) const { unsigned i; diff --git a/src/gpgpu-sim/gpu-sim.cc b/src/gpgpu-sim/gpu-sim.cc index 55d70d115..3f84d42fc 100644 --- a/src/gpgpu-sim/gpu-sim.cc +++ b/src/gpgpu-sim/gpu-sim.cc @@ -247,12 +247,12 @@ void memory_config::reg_options(class OptionParser *opp) { option_parser_register(opp, "-l2_ideal", OPT_BOOL, &l2_ideal, "Use a ideal L2 cache that always hit", "0"); - option_parser_register(opp, "-gpgpu_cache:dl2", OPT_CSTR, - &m_L2_config.m_config_string, - "unified banked L2 data cache config " - " {::,:::,::,}", - "64:128:8,L:B:m:N,A:16:4,4"); + option_parser_register( + opp, "-gpgpu_cache:dl2", OPT_CSTR, &m_L2_config.m_config_string, + "unified banked L2 data cache config " + " {:::,::::,::,:,", + "S:32:128:24,L:B:m:L:P,A:192:4,32:0,32"); option_parser_register(opp, "-gpgpu_cache:dl2_texture_only", OPT_BOOL, &m_L2_texure_only, "L2 cache used for texture only", "1"); @@ -344,17 +344,18 @@ void shader_core_config::reg_options(class OptionParser *opp) { " {::,:::,::<" "merge>,} ", "64:64:2,L:R:f:N,A:2:32,4"); - option_parser_register(opp, "-gpgpu_cache:il1", OPT_CSTR, - &m_L1I_config.m_config_string, - "shader L1 instruction cache config " - " {::,:::,::,} ", - "4:256:4,L:R:f:N,A:2:32,4"); + option_parser_register( + opp, "-gpgpu_cache:il1", OPT_CSTR, &m_L1I_config.m_config_string, + "shader L1 instruction cache config " + " {:::,::::,::,} ", + "N:64:128:16,L:R:f:N:L,S:2:48,4"); option_parser_register(opp, "-gpgpu_cache:dl1", OPT_CSTR, &m_L1D_config.m_config_string, "per-shader L1 data cache config " - " {::,:::,::, | none}", + " {:::,::<" + "alloc>::,::,<" + "mq>:, | none}", "none"); option_parser_register(opp, "-gpgpu_l1_cache_write_ratio", OPT_UINT32, &m_L1D_config.m_wr_percent, "L1D write ratio", "0"); @@ -374,14 +375,16 @@ void shader_core_config::reg_options(class OptionParser *opp) { option_parser_register(opp, "-gpgpu_cache:dl1PrefL1", OPT_CSTR, &m_L1D_config.m_config_stringPrefL1, "per-shader L1 data cache config " - " {::,:::,::, | none}", + " {:::,::<" + "alloc>::,::,<" + "mq>:, | none}", "none"); option_parser_register(opp, "-gpgpu_cache:dl1PrefShared", OPT_CSTR, &m_L1D_config.m_config_stringPrefShared, "per-shader L1 data cache config " - " {::,:::,::, | none}", + " {:::,::<" + "alloc>::,::,<" + "mq>:, | none}", "none"); option_parser_register(opp, "-gpgpu_gmem_skip_L1D", OPT_BOOL, &gmem_skip_L1D, "global memory access skip L1D cache (implements " diff --git a/src/gpgpu-sim/gpu-sim.h b/src/gpgpu-sim/gpu-sim.h index c1f50961f..68bdca72e 100644 --- a/src/gpgpu-sim/gpu-sim.h +++ b/src/gpgpu-sim/gpu-sim.h @@ -32,11 +32,11 @@ #ifndef GPU_SIM_H #define GPU_SIM_H +#include #include #include #include #include -#include #include "../abstract_hardware_model.h" #include "../option_parser.h" #include "../trace.h" @@ -879,7 +879,7 @@ class sst_gpgpu_sim : public gpgpu_sim { * @param dst_start_addr * @param count */ - void perf_memcpy_to_gpu(size_t dst_start_addr, size_t count){}; + void perf_memcpy_to_gpu(size_t dst_start_addr, size_t count) {}; /** * @brief Check if the SST config matches up with the diff --git a/src/gpgpu-sim/l2cache.h b/src/gpgpu-sim/l2cache.h index 65c9c38b3..9d164d766 100644 --- a/src/gpgpu-sim/l2cache.h +++ b/src/gpgpu-sim/l2cache.h @@ -106,9 +106,7 @@ class memory_partition_unit { unsigned get_mpid() const { return m_id; } - class gpgpu_sim *get_mgpu() const { - return m_gpu; - } + class gpgpu_sim *get_mgpu() const { return m_gpu; } private: unsigned m_id; diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index ee10af664..0840d3f76 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -46,7 +46,7 @@ #include #include -//#include "../cuda-sim/ptx.tab.h" +// #include "../cuda-sim/ptx.tab.h" #include "../abstract_hardware_model.h" #include "delayqueue.h" @@ -273,9 +273,7 @@ class shd_warp_t { unsigned get_dynamic_warp_id() const { return m_dynamic_warp_id; } unsigned get_warp_id() const { return m_warp_id; } - class shader_core_ctx *get_shader() { - return m_shader; - } + class shader_core_ctx *get_shader() { return m_shader; } private: static const unsigned IBUFFER_SIZE = 2; diff --git a/src/gpgpu-sim/visualizer.cc b/src/gpgpu-sim/visualizer.cc index a832d6119..022201984 100644 --- a/src/gpgpu-sim/visualizer.cc +++ b/src/gpgpu-sim/visualizer.cc @@ -34,7 +34,7 @@ #include "mem_latency_stat.h" #include "power_stat.h" #include "shader.h" -//#include "../../../mcpat/processor.h" +// #include "../../../mcpat/processor.h" #include "gpu-cache.h" #include "stat-tool.h" diff --git a/src/gpgpusim_entrypoint.cc b/src/gpgpusim_entrypoint.cc index e2b711ede..be492295f 100644 --- a/src/gpgpusim_entrypoint.cc +++ b/src/gpgpusim_entrypoint.cc @@ -56,7 +56,9 @@ class stream_manager *g_stream_manager() { // SST callback extern void SST_callback_cudaThreadSynchronize_done(); +extern void SST_callback_cudaStreamSynchronize_done(cudaStream_t stream); __attribute__((weak)) void SST_callback_cudaThreadSynchronize_done() {} +__attribute__((weak)) void SST_callback_cudaStreamSynchronize_done(cudaStream_t stream) {} void *gpgpu_sim_thread_sequential(void *ctx_ptr) { gpgpu_context *ctx = (gpgpu_context *)ctx_ptr; @@ -100,8 +102,7 @@ void *gpgpu_sim_thread_concurrent(void *ctx_ptr) { fflush(stdout); } while (ctx->the_gpgpusim->g_stream_manager->empty_protected() && - !ctx->the_gpgpusim->g_sim_done) - ; + !ctx->the_gpgpusim->g_sim_done); if (g_debug_execution >= 3) { printf("GPGPU-Sim: ** START simulation thread (detected work) **\n"); ctx->the_gpgpusim->g_stream_manager->print(stdout); @@ -190,12 +191,33 @@ bool SST_Cycle() { // Check if Synchronize is done when SST previously requested // cudaThreadSynchronize if (GPGPU_Context()->requested_synchronize && - ((g_stream_manager()->empty() && !GPGPUsim_ctx_ptr()->g_sim_active) || + ((g_stream_manager()->empty_protected() && !GPGPUsim_ctx_ptr()->g_sim_active) || GPGPUsim_ctx_ptr()->g_sim_done)) { SST_callback_cudaThreadSynchronize_done(); GPGPU_Context()->requested_synchronize = false; } + // Polling to check for each stream if it is marked for requested with sync + if (g_stream_manager()->get_stream_zero()->requested_synchronize() && + ((g_stream_manager()->empty_protected() && !GPGPUsim_ctx_ptr()->g_sim_active) || + GPGPUsim_ctx_ptr()->g_sim_done)) { + SST_callback_cudaStreamSynchronize_done(0); + g_stream_manager()->get_stream_zero()->reset_request_synchronize(); + } + + // Iterate through each stream to check if SST is waiting on + // it and it does not have any operation + std::list& streams = g_stream_manager()->get_concurrent_streams(); + for (auto it = streams.begin(); it != streams.end(); it++) { + CUstream_st *stream = *it; + if (stream->requested_synchronize() && + stream->empty()) { + // This stream is ready + SST_callback_cudaStreamSynchronize_done(stream); + stream->reset_request_synchronize(); + } + } + if (g_stream_manager()->empty_protected() && !GPGPUsim_ctx_ptr()->g_sim_done && !g_the_gpu()->active()) { GPGPUsim_ctx_ptr()->g_sim_active = false; @@ -273,7 +295,6 @@ void gpgpu_context::synchronize() { bool gpgpu_context::synchronize_check() { // printf("GPGPU-Sim: synchronize checking for inactive GPU simulation\n"); - requested_synchronize = true; the_gpgpusim->g_stream_manager->print(stdout); fflush(stdout); // sem_wait(&g_sim_signal_finish); diff --git a/src/stream_manager.cc b/src/stream_manager.cc index 58c2ec4b5..d43964a8f 100644 --- a/src/stream_manager.cc +++ b/src/stream_manager.cc @@ -34,15 +34,20 @@ unsigned CUstream_st::sm_next_stream_uid = 0; -// SST memcpy callbacks -extern void SST_callback_memcpy_H2D_done(); -extern void SST_callback_memcpy_D2H_done(); +// SST memcpy callbacks, called after a stream operation is done via record_next_done() +extern void SST_callback_memcpy_H2D_done(uint64_t dst, uint64_t src, size_t count, cudaStream_t stream); +extern void SST_callback_memcpy_D2H_done(uint64_t dst, uint64_t src, size_t count, cudaStream_t stream); extern void SST_callback_memcpy_to_symbol_done(); extern void SST_callback_memcpy_from_symbol_done(); -__attribute__((weak)) void SST_callback_memcpy_H2D_done() {} -__attribute__((weak)) void SST_callback_memcpy_D2H_done() {} +extern void SST_callback_cudaEventSynchronize_done(cudaEvent_t event); +extern void SST_callback_kernel_done(cudaStream_t stream); +__attribute__((weak)) void SST_callback_memcpy_H2D_done(uint64_t dst, uint64_t src, size_t count, cudaStream_t stream) {} +__attribute__((weak)) void SST_callback_memcpy_D2H_done(uint64_t dst, uint64_t src, size_t count, cudaStream_t stream) {} __attribute__((weak)) void SST_callback_memcpy_to_symbol_done() {} __attribute__((weak)) void SST_callback_memcpy_from_symbol_done() {} +__attribute__((weak)) void SST_callback_cudaEventSynchronize_done(cudaEvent_t event); +__attribute__((weak)) void SST_callback_kernel_done(cudaStream_t stream); + CUstream_st::CUstream_st() { m_pending = false; @@ -74,6 +79,10 @@ void CUstream_st::synchronize() { } while (!done); } +bool CUstream_st::synchronize_check() { + return m_operations.empty(); +} + void CUstream_st::push(const stream_operation &op) { // called by host thread pthread_mutex_lock(&m_lock); @@ -132,13 +141,15 @@ bool stream_operation::do_operation(gpgpu_sim *gpu) { if (g_debug_execution >= 3) printf("memcpy host-to-device\n"); gpu->memcpy_to_gpu(m_device_address_dst, m_host_address_src, m_cnt); m_stream->record_next_done(); - if (gpu->is_SST_mode()) SST_callback_memcpy_H2D_done(); + if (gpu->is_SST_mode()) { + SST_callback_memcpy_H2D_done((uint64_t) m_device_address_dst, (uint64_t) m_host_address_src, m_cnt, m_stream->is_stream_zero_stream() ? 0 : m_stream); + } break; case stream_memcpy_device_to_host: if (g_debug_execution >= 3) printf("memcpy device-to-host\n"); gpu->memcpy_from_gpu(m_host_address_dst, m_device_address_src, m_cnt); m_stream->record_next_done(); - if (gpu->is_SST_mode()) SST_callback_memcpy_D2H_done(); + if (gpu->is_SST_mode()) SST_callback_memcpy_D2H_done((uint64_t) m_host_address_dst, (uint64_t) m_device_address_src, m_cnt, m_stream->is_stream_zero_stream() ? 0 : m_stream); break; case stream_memcpy_device_to_device: if (g_debug_execution >= 3) printf("memcpy device-to-device\n"); @@ -194,6 +205,13 @@ bool stream_operation::do_operation(gpgpu_sim *gpu) { time_t wallclock = time((time_t *)NULL); m_event->update(gpu->gpu_tot_sim_cycle, wallclock); m_stream->record_next_done(); + if ((gpu->is_SST_mode()) && m_event->done() && + m_event->requested_synchronize()) { + // Notify that the event is done + SST_callback_cudaEventSynchronize_done(m_event); + // Reset the sync flag as we have notified SST + m_event->reset_request_synchronize(); + } } break; case stream_wait_event: // only allows next op to go if event is done @@ -252,6 +270,9 @@ stream_manager::stream_manager(gpgpu_sim *gpu, bool cuda_launch_blocking) { m_cuda_launch_blocking = cuda_launch_blocking; pthread_mutex_init(&m_lock, NULL); m_last_stream = m_streams.begin(); + + // Mark stream zero as the default stream + m_stream_zero.set_stream_zero(); } bool stream_manager::operation(bool *sim) { @@ -303,6 +324,11 @@ bool stream_manager::register_finished_kernel(unsigned grid_uid) { // grid_uid, stream->get_uid()); kernel_stat.flush(); // kernel_stat.close(); stream->record_next_done(); + // Callback to notify a kernel is done for SST's stream + // manager to support with nonblocking + blocking kernel launch + if (m_gpu->is_SST_mode()) { + SST_callback_kernel_done(stream->is_stream_zero_stream() ? 0 : stream); + } m_grid_id_to_stream.erase(grid_uid); kernel->notify_parent_finished(); delete kernel; @@ -393,8 +419,7 @@ void stream_manager::add_stream(struct CUstream_st *stream) { void stream_manager::destroy_stream(CUstream_st *stream) { // called by host thread pthread_mutex_lock(&m_lock); - while (!stream->empty()) - ; + while (!stream->empty()); std::list::iterator s; for (s = m_streams.begin(); s != m_streams.end(); s++) { if (*s == stream) { diff --git a/src/stream_manager.h b/src/stream_manager.h index 561f54b87..55cfb8d28 100644 --- a/src/stream_manager.h +++ b/src/stream_manager.h @@ -69,6 +69,11 @@ struct CUevent_st { void issue() { m_issued++; } unsigned int num_issued() const { return m_issued; } + // SST related, stating this event is requested to synchronize + void set_request_synchronize() { m_requested_synchronize = true; } + void reset_request_synchronize() { m_requested_synchronize = false; } + bool requested_synchronize() const { return m_requested_synchronize; } + private: int m_uid; bool m_blocking; @@ -77,6 +82,9 @@ struct CUevent_st { unsigned int m_issued; time_t m_wallclock; double m_gpu_tot_sim_cycle; + + // SST related + bool m_requested_synchronize = false; static int m_next_event_uid; }; @@ -198,9 +206,7 @@ class stream_operation { kernel_info_t *get_kernel() { return m_kernel; } bool do_operation(gpgpu_sim *gpu); void print(FILE *fp) const; - struct CUstream_st *get_stream() { - return m_stream; - } + struct CUstream_st *get_stream() { return m_stream; } void set_stream(CUstream_st *stream) { m_stream = stream; } private: @@ -228,6 +234,7 @@ struct CUstream_st { bool empty(); bool busy(); void synchronize(); + bool synchronize_check(); void push(const stream_operation &op); void record_next_done(); stream_operation next(); @@ -235,6 +242,12 @@ struct CUstream_st { stream_operation &front() { return m_operations.front(); } void print(FILE *fp); unsigned get_uid() const { return m_uid; } + void set_request_synchronize() { m_requested_synchronize = true; } + void reset_request_synchronize() { m_requested_synchronize = false; } + bool requested_synchronize() const { return m_requested_synchronize; } + void set_stream_zero() { is_stream_zero = true; } + bool is_stream_zero_stream() { return is_stream_zero; } + void reset_stream_zero() { is_stream_zero = false; } private: unsigned m_uid; @@ -245,6 +258,11 @@ struct CUstream_st { pthread_mutex_t m_lock; // ensure only one host or gpu manipulates stream // operation at one time + + // SST related, use to record the stream is requested to synchronize + bool m_requested_synchronize = false; + // Whether this is the default stream + bool is_stream_zero = false; }; class stream_manager { @@ -265,6 +283,8 @@ class stream_manager { void stop_all_running_kernels(); unsigned size() { return m_streams.size(); }; bool is_blocking() { return m_cuda_launch_blocking; }; + CUstream_st *get_stream_zero() { return &m_stream_zero; }; + std::list& get_concurrent_streams() { return m_streams; }; private: void print_impl(FILE *fp);