Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add UVM Support to current GPGPU-Sim #219

Open
wants to merge 6 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
"name": "CUDA 12.8",
"image": "ghcr.io/accel-sim/accel-sim-framework:Ubuntu-24.04-cuda-12.8"
}
4 changes: 4 additions & 0 deletions .devcontainer/sst_integration/devcontainer.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
"name": "SST CUDA 11.7",
"image": "ghcr.io/accel-sim/accel-sim-framework:SST-Integration-Ubuntu-22.04-cuda-11.7-llvm-18.1.8-riscv-gnu-2024.08.06-nightly"
}
2 changes: 1 addition & 1 deletion .github/workflows/accelsim.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ jobs:
build-QV100:
runs-on: ubuntu-latest
container:
image: tgrogers/accel-sim_regress:Ubuntu-22.04-cuda-11.7
image: ghcr.io/accel-sim/accel-sim-framework:ubuntu-24.04-cuda-12.8

# Steps represent a sequence of tasks that will be executed as part of the job
steps:
Expand Down
10 changes: 5 additions & 5 deletions .github/workflows/cmake.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ jobs:
build-TITANV:
runs-on: ubuntu-latest
container:
image: tgrogers/accel-sim_regress:Ubuntu-22.04-cuda-11.7
image: ghcr.io/accel-sim/accel-sim-framework:ubuntu-24.04-cuda-12.8
env:
CONFIG: TITANV

Expand All @@ -32,7 +32,7 @@ jobs:
build-TITANV-LOCALXBAR:
runs-on: ubuntu-latest
container:
image: tgrogers/accel-sim_regress:Ubuntu-22.04-cuda-11.7
image: ghcr.io/accel-sim/accel-sim-framework:ubuntu-24.04-cuda-12.8
env:
CONFIG: TITANV-LOCALXBAR

Expand All @@ -46,7 +46,7 @@ jobs:
build-QV100:
runs-on: ubuntu-latest
container:
image: tgrogers/accel-sim_regress:Ubuntu-22.04-cuda-11.7
image: ghcr.io/accel-sim/accel-sim-framework:ubuntu-24.04-cuda-12.8
env:
CONFIG: QV100

Expand All @@ -60,7 +60,7 @@ jobs:
build-2060:
runs-on: ubuntu-latest
container:
image: tgrogers/accel-sim_regress:Ubuntu-22.04-cuda-11.7
image: ghcr.io/accel-sim/accel-sim-framework:ubuntu-24.04-cuda-12.8
env:
CONFIG: RTX2060

Expand All @@ -74,7 +74,7 @@ jobs:
build-3070:
runs-on: ubuntu-latest
container:
image: tgrogers/accel-sim_regress:Ubuntu-22.04-cuda-11.7
image: ghcr.io/accel-sim/accel-sim-framework:ubuntu-24.04-cuda-12.8
env:
CONFIG: RTX3070

Expand Down
10 changes: 5 additions & 5 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ jobs:
build-TITANV:
runs-on: ubuntu-latest
container:
image: tgrogers/accel-sim_regress:Ubuntu-22.04-cuda-11.7
image: ghcr.io/accel-sim/accel-sim-framework:ubuntu-24.04-cuda-12.8
env:
CONFIG: TITANV

Expand All @@ -32,7 +32,7 @@ jobs:
build-TITANV-LOCALXBAR:
runs-on: ubuntu-latest
container:
image: tgrogers/accel-sim_regress:Ubuntu-22.04-cuda-11.7
image: ghcr.io/accel-sim/accel-sim-framework:ubuntu-24.04-cuda-12.8
env:
CONFIG: TITANV-LOCALXBAR

Expand All @@ -46,7 +46,7 @@ jobs:
build-QV100:
runs-on: ubuntu-latest
container:
image: tgrogers/accel-sim_regress:Ubuntu-22.04-cuda-11.7
image: ghcr.io/accel-sim/accel-sim-framework:ubuntu-24.04-cuda-12.8
env:
CONFIG: QV100

Expand All @@ -60,7 +60,7 @@ jobs:
build-2060:
runs-on: ubuntu-latest
container:
image: tgrogers/accel-sim_regress:Ubuntu-22.04-cuda-11.7
image: ghcr.io/accel-sim/accel-sim-framework:ubuntu-24.04-cuda-12.8
env:
CONFIG: RTX2060

Expand All @@ -74,7 +74,7 @@ jobs:
build-3070:
runs-on: ubuntu-latest
container:
image: tgrogers/accel-sim_regress:Ubuntu-22.04-cuda-11.7
image: ghcr.io/accel-sim/accel-sim-framework:ubuntu-24.04-cuda-12.8
env:
CONFIG: RTX3070

Expand Down
8 changes: 8 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,17 @@ debug_tools/WatchYourStep/ptxjitplus/gpgpu*
debug_tools/WatchYourStep/ptxjitplus/*.old
debug_tools/WatchYourStep/ptxjitplus/ptxjitplus
debug_tools/WatchYourStep/ptxjitplus/*.ptx
*.tmp

# Accel-sim packages used for regressions
accel-sim-framework/
gpu-app-collection/

setup

# OS/IDE specific files
.idea/
.vscode/
.DS_Store
.DS_store
__pycache__/
6 changes: 5 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -164,4 +164,8 @@ install(CODE "execute_process\(\
install(CODE "execute_process\(\
COMMAND ${CMAKE_COMMAND} -E create_symlink \
${GPGPUSIM_INSTALL_PATH}/$<TARGET_FILE_NAME:cudart> \
${GPGPUSIM_INSTALL_PATH}/$<TARGET_FILE_NAME:cudart>.11.0\)")
${GPGPUSIM_INSTALL_PATH}/$<TARGET_FILE_NAME:cudart>.11.0\)")
install(CODE "execute_process\(\
COMMAND ${CMAKE_COMMAND} -E create_symlink \
${GPGPUSIM_INSTALL_PATH}/$<TARGET_FILE_NAME:cudart> \
${GPGPUSIM_INSTALL_PATH}/$<TARGET_FILE_NAME:cudart>.12\)")
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,7 @@ $(SIM_LIB_DIR)/libcudart.so: makedirs $(LIBS) cudalib
if [ ! -f $(SIM_LIB_DIR)/libcudart.so.10.0 ]; then ln -s libcudart.so $(SIM_LIB_DIR)/libcudart.so.10.0; fi
if [ ! -f $(SIM_LIB_DIR)/libcudart.so.10.1 ]; then ln -s libcudart.so $(SIM_LIB_DIR)/libcudart.so.10.1; fi
if [ ! -f $(SIM_LIB_DIR)/libcudart.so.11.0 ]; then ln -s libcudart.so $(SIM_LIB_DIR)/libcudart.so.11.0; fi
if [ ! -f $(SIM_LIB_DIR)/libcudart.so.12 ]; then ln -s libcudart.so $(SIM_LIB_DIR)/libcudart.so.12; fi
if [ ! -f $(SIM_LIB_DIR)/libcudart_mod.so ]; then ln -s libcudart.so $(SIM_LIB_DIR)/libcudart_mod.so; fi

$(SIM_LIB_DIR)/libcudart.dylib: makedirs $(LIBS) cudalib
Expand Down
15 changes: 0 additions & 15 deletions bitbucket-pipelines.yml

This file was deleted.

4 changes: 2 additions & 2 deletions configs/tested-cfgs/SM2_GTX480/gpgpusim.config
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,8 @@


# In Fermi, the cache and shared memory can be configured to 16kb:48kb(default) or 48kb:16kb
# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>
# ** Optional parameter - Required when mshr_type==Texture Fifo
# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>,<data_port_width>
# ** Optional parameter - Required when mshr_type==Texture Fifo, set to 0 if not used
# Note: Hashing set index function (H) only applies to a set size of 32 or 64.
-gpgpu_cache:dl1 N:32:128:4,L:L:m:N:H,S:64:8,8
-gpgpu_shmem_size 49152
Expand Down
4 changes: 2 additions & 2 deletions configs/tested-cfgs/SM3_KEPLER_TITAN/gpgpusim.config
Original file line number Diff line number Diff line change
Expand Up @@ -99,8 +99,8 @@
# Greedy then oldest scheduler
-gpgpu_scheduler gto

# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>
# ** Optional parameter - Required when mshr_type==Texture Fifo
# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>,<data_port_width>
# ** Optional parameter - Required when mshr_type==Texture Fifo, set to 0 if not used
# Note: Hashing set index function (H) only applies to a set size of 32 or 64.
# The default is to disable the L1 cache, unless cache modifiers are used
-gpgpu_cache:dl1 S:4:128:32,L:L:s:N:L,A:256:8,16:0,32
Expand Down
4 changes: 2 additions & 2 deletions configs/tested-cfgs/SM6_TITANX/gpgpusim.config
Original file line number Diff line number Diff line change
Expand Up @@ -123,8 +123,8 @@
-gpgpu_dual_issue_diff_exec_units 1

## L1/shared memory configuration
# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>
# ** Optional parameter - Required when mshr_type==Texture Fifo
# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>,<data_port_width>
# ** Optional parameter - Required when mshr_type==Texture Fifo, set to 0 if not used
# Note: Hashing set index function (H) only applies to a set size of 32 or 64.
# The default is to disable the L1 cache, unless cache modifiers are used
-gpgpu_l1_banks 2
Expand Down
4 changes: 2 additions & 2 deletions configs/tested-cfgs/SM75_RTX2060/gpgpusim.config
Original file line number Diff line number Diff line change
Expand Up @@ -83,8 +83,8 @@
-gpgpu_dual_issue_diff_exec_units 1

## L1/shared memory configuration
# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>
# ** Optional parameter - Required when mshr_type==Texture Fifo
# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>,<data_port_width>
# ** Optional parameter - Required when mshr_type==Texture Fifo, set to 0 if not used
# In adaptive cache, we adaptively assign the remaining shared memory to L1 cache
# For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x
-gpgpu_adaptive_cache_config 1
Expand Down
4 changes: 2 additions & 2 deletions configs/tested-cfgs/SM75_RTX2060_S/gpgpusim.config
Original file line number Diff line number Diff line change
Expand Up @@ -128,8 +128,8 @@
-gpgpu_num_reg_banks 16
-gpgpu_reg_file_port_throughput 2

# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>
# ** Optional parameter - Required when mshr_type==Texture Fifo
# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>,<data_port_width>
# ** Optional parameter - Required when mshr_type==Texture Fifo, set to 0 if not used
-gpgpu_adaptive_cache_config 0
-gpgpu_l1_banks 4
-gpgpu_cache:dl1 S:1:128:512,L:L:s:N:L,A:256:8,16:0,32
Expand Down
4 changes: 2 additions & 2 deletions configs/tested-cfgs/SM7_GV100/gpgpusim.config
Original file line number Diff line number Diff line change
Expand Up @@ -137,8 +137,8 @@
-gpgpu_dual_issue_diff_exec_units 1

## L1/shared memory configuration
# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>
# ** Optional parameter - Required when mshr_type==Texture Fifo
# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>,<data_port_width>
# ** Optional parameter - Required when mshr_type==Texture Fifo, set to 0 if not used
# Default config is 32KB DL1 and 96KB shared memory
# In Volta, we assign the remaining shared memory to L1 cache
# if the assigned shd mem = 0, then L1 cache = 128KB
Expand Down
4 changes: 2 additions & 2 deletions configs/tested-cfgs/SM7_QV100/gpgpusim.config
Original file line number Diff line number Diff line change
Expand Up @@ -137,8 +137,8 @@
-gpgpu_dual_issue_diff_exec_units 1

## L1/shared memory configuration
# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>
# ** Optional parameter - Required when mshr_type==Texture Fifo
# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>,<data_port_width>
# ** Optional parameter - Required when mshr_type==Texture Fifo, set to 0 if not used
# Default config is 32KB DL1 and 96KB shared memory
# In Volta, we assign the remaining shared memory to L1 cache
# if the assigned shd mem = 0, then L1 cache = 128KB
Expand Down
4 changes: 2 additions & 2 deletions configs/tested-cfgs/SM7_TITANV/gpgpusim.config
Original file line number Diff line number Diff line change
Expand Up @@ -107,8 +107,8 @@
-gpgpu_dual_issue_diff_exec_units 1

## L1/shared memory configuration
# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>
# ** Optional parameter - Required when mshr_type==Texture Fifo
# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>,<data_port_width>
# ** Optional parameter - Required when mshr_type==Texture Fifo, set to 0 if not used
# Default config is 32KB DL1 and 96KB shared memory
# In Volta, we assign the remaining shared memory to L1 cache
# if the assigned shd mem = 0, then L1 cache = 128KB
Expand Down
4 changes: 2 additions & 2 deletions configs/tested-cfgs/SM86_RTX3070/gpgpusim.config
Original file line number Diff line number Diff line change
Expand Up @@ -83,8 +83,8 @@
-gpgpu_dual_issue_diff_exec_units 1

## L1/shared memory configuration
# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>
# ** Optional parameter - Required when mshr_type==Texture Fifo
# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>,<data_port_width>
# ** Optional parameter - Required when mshr_type==Texture Fifo, set to 0 if not used
# In adaptive cache, we adaptively assign the remaining shared memory to L1 cache
# For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x
-gpgpu_adaptive_cache_config 1
Expand Down
20 changes: 10 additions & 10 deletions format-code.sh
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
# This bash script formats GPGPU-Sim using clang-format
THIS_DIR="$( cd "$( dirname "$BASH_SOURCE" )" && pwd )"
echo "Running clang-format on $THIS_DIR"
clang-format -i ${THIS_DIR}/libcuda/*.h
clang-format -i ${THIS_DIR}/libcuda/*.cc
clang-format -i ${THIS_DIR}/src/*.h
clang-format -i ${THIS_DIR}/src/*.cc
clang-format -i ${THIS_DIR}/src/gpgpu-sim/*.h
clang-format -i ${THIS_DIR}/src/gpgpu-sim/*.cc
clang-format -i ${THIS_DIR}/src/cuda-sim/*.h
clang-format -i ${THIS_DIR}/src/cuda-sim/*.cc
clang-format -i ${THIS_DIR}/src/accelwattch/*.h
clang-format -i ${THIS_DIR}/src/accelwattch/*.cc
clang-format -i ${THIS_DIR}/libcuda/*.h --style=file:${THIS_DIR}/.clang-format
clang-format -i ${THIS_DIR}/libcuda/*.cc --style=file:${THIS_DIR}/.clang-format
clang-format -i ${THIS_DIR}/src/*.h --style=file:${THIS_DIR}/.clang-format
clang-format -i ${THIS_DIR}/src/*.cc --style=file:${THIS_DIR}/.clang-format
clang-format -i ${THIS_DIR}/src/gpgpu-sim/*.h --style=file:${THIS_DIR}/.clang-format
clang-format -i ${THIS_DIR}/src/gpgpu-sim/*.cc --style=file:${THIS_DIR}/.clang-format
clang-format -i ${THIS_DIR}/src/cuda-sim/*.h --style=file:${THIS_DIR}/.clang-format
clang-format -i ${THIS_DIR}/src/cuda-sim/*.cc --style=file:${THIS_DIR}/.clang-format
clang-format -i ${THIS_DIR}/src/accelwattch/*.h --style=file:${THIS_DIR}/.clang-format
clang-format -i ${THIS_DIR}/src/accelwattch/*.cc --style=file:${THIS_DIR}/.clang-format
6 changes: 3 additions & 3 deletions gpgpusim_check.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,8 @@ else()
message(CHECK_PASS "${CUDAToolkit_NVCC_EXECUTABLE}")
message(CHECK_START "Checking CUDA compiler version")
message(CHECK_PASS "${CUDAToolkit_VERSION}")
if((CUDAToolkit_VERSION VERSION_LESS 2.0.3) OR (CUDAToolkit_VERSION VERSION_GREATER 11.10.0))
message(FATAL_ERROR "GPGPU-Sim ${CMAKE_PROJECT_VERSION} not tested with CUDA version ${CUDAToolkit_VERSION} (please see README)")
if((CUDAToolkit_VERSION VERSION_LESS 2.0.3) OR (CUDAToolkit_VERSION VERSION_GREATER 13.0.0))
message(WARNING "GPGPU-Sim not tested with CUDA version ${CUDAToolkit_VERSION} (please see README)")
endif()
endif()

Expand Down Expand Up @@ -132,4 +132,4 @@ list(POP_BACK CMAKE_MESSAGE_INDENT)
message(CHECK_PASS "done")
message(STATUS "Be sure to run 'source setup' "
"before you run CUDA program with GPGPU-Sim or building with external "
"simulator like SST")
"simulator like SST")
4 changes: 2 additions & 2 deletions libcuda/cuda_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -2607,12 +2607,12 @@ typedef struct CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st {
/**
* Device that represents the CPU
*/
#define CU_DEVICE_CPU ((CUdevice)-1)
#define CU_DEVICE_CPU ((CUdevice) - 1)

/**
* Device that represents an invalid device
*/
#define CU_DEVICE_INVALID ((CUdevice)-2)
#define CU_DEVICE_INVALID ((CUdevice) - 2)

/** @} */ /* END CUDA_TYPES */

Expand Down
8 changes: 2 additions & 6 deletions libcuda/cuda_api_object.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,7 @@ struct _cuda_device_id {
m_next = NULL;
m_gpgpu = gpu;
}
struct _cuda_device_id *next() {
return m_next;
}
struct _cuda_device_id *next() { return m_next; }
unsigned num_shader() const { return m_gpgpu->get_config().num_shader(); }
int num_devices() const {
if (m_next == NULL)
Expand Down Expand Up @@ -158,9 +156,7 @@ class kernel_config {
void set_grid_dim(dim3 *d) { m_GridDim = *d; }
void set_block_dim(dim3 *d) { m_BlockDim = *d; }
gpgpu_ptx_sim_arg_list_t get_args() { return m_args; }
struct CUstream_st *get_stream() {
return m_stream;
}
struct CUstream_st *get_stream() { return m_stream; }

private:
dim3 m_GridDim;
Expand Down
Loading