diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
new file mode 100644
index 000000000..f5843f2be
--- /dev/null
+++ b/.devcontainer/devcontainer.json
@@ -0,0 +1,4 @@
+{
+    "name": "CUDA 12.8",
+    "image": "ghcr.io/accel-sim/accel-sim-framework:Ubuntu-24.04-cuda-12.8"
+}
\ No newline at end of file
diff --git a/.devcontainer/sst_integration/devcontainer.json b/.devcontainer/sst_integration/devcontainer.json
new file mode 100644
index 000000000..e3732b4e3
--- /dev/null
+++ b/.devcontainer/sst_integration/devcontainer.json
@@ -0,0 +1,4 @@
+{
+    "name": "SST CUDA 11.7",
+    "image": "ghcr.io/accel-sim/accel-sim-framework:SST-Integration-Ubuntu-22.04-cuda-11.7-llvm-18.1.8-riscv-gnu-2024.08.06-nightly"
+}
\ No newline at end of file
diff --git a/.github/workflows/accelsim.yml b/.github/workflows/accelsim.yml
index 741035226..39ae46df0 100644
--- a/.github/workflows/accelsim.yml
+++ b/.github/workflows/accelsim.yml
@@ -22,7 +22,7 @@ jobs:
   build-QV100:
     runs-on: ubuntu-latest
     container:
-      image: tgrogers/accel-sim_regress:Ubuntu-22.04-cuda-11.7
+      image: ghcr.io/accel-sim/accel-sim-framework:ubuntu-24.04-cuda-12.8
         
     # Steps represent a sequence of tasks that will be executed as part of the job
     steps:
diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml
index c0a22ebf2..c37bd412b 100644
--- a/.github/workflows/cmake.yml
+++ b/.github/workflows/cmake.yml
@@ -18,7 +18,7 @@ jobs:
   build-TITANV:
     runs-on: ubuntu-latest
     container:
-      image: tgrogers/accel-sim_regress:Ubuntu-22.04-cuda-11.7
+      image: ghcr.io/accel-sim/accel-sim-framework:ubuntu-24.04-cuda-12.8
       env:
         CONFIG: TITANV
         
@@ -32,7 +32,7 @@ jobs:
   build-TITANV-LOCALXBAR:
     runs-on: ubuntu-latest
     container:
-      image: tgrogers/accel-sim_regress:Ubuntu-22.04-cuda-11.7
+      image: ghcr.io/accel-sim/accel-sim-framework:ubuntu-24.04-cuda-12.8
       env:
         CONFIG: TITANV-LOCALXBAR
         
@@ -46,7 +46,7 @@ jobs:
   build-QV100:
     runs-on: ubuntu-latest
     container:
-      image: tgrogers/accel-sim_regress:Ubuntu-22.04-cuda-11.7
+      image: ghcr.io/accel-sim/accel-sim-framework:ubuntu-24.04-cuda-12.8
       env:
         CONFIG: QV100
         
@@ -60,7 +60,7 @@ jobs:
   build-2060:
     runs-on: ubuntu-latest
     container:
-      image: tgrogers/accel-sim_regress:Ubuntu-22.04-cuda-11.7
+      image: ghcr.io/accel-sim/accel-sim-framework:ubuntu-24.04-cuda-12.8
       env:
         CONFIG: RTX2060
         
@@ -74,7 +74,7 @@ jobs:
   build-3070:
     runs-on: ubuntu-latest
     container:
-      image: tgrogers/accel-sim_regress:Ubuntu-22.04-cuda-11.7
+      image: ghcr.io/accel-sim/accel-sim-framework:ubuntu-24.04-cuda-12.8
       env:
         CONFIG: RTX3070
         
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 954b98faf..a4f838daf 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -18,7 +18,7 @@ jobs:
   build-TITANV:
     runs-on: ubuntu-latest
     container:
-      image: tgrogers/accel-sim_regress:Ubuntu-22.04-cuda-11.7
+      image: ghcr.io/accel-sim/accel-sim-framework:ubuntu-24.04-cuda-12.8
       env:
         CONFIG: TITANV
         
@@ -32,7 +32,7 @@ jobs:
   build-TITANV-LOCALXBAR:
     runs-on: ubuntu-latest
     container:
-      image: tgrogers/accel-sim_regress:Ubuntu-22.04-cuda-11.7
+      image: ghcr.io/accel-sim/accel-sim-framework:ubuntu-24.04-cuda-12.8
       env:
         CONFIG: TITANV-LOCALXBAR
         
@@ -46,7 +46,7 @@ jobs:
   build-QV100:
     runs-on: ubuntu-latest
     container:
-      image: tgrogers/accel-sim_regress:Ubuntu-22.04-cuda-11.7
+      image: ghcr.io/accel-sim/accel-sim-framework:ubuntu-24.04-cuda-12.8
       env:
         CONFIG: QV100
         
@@ -60,7 +60,7 @@ jobs:
   build-2060:
     runs-on: ubuntu-latest
     container:
-      image: tgrogers/accel-sim_regress:Ubuntu-22.04-cuda-11.7
+      image: ghcr.io/accel-sim/accel-sim-framework:ubuntu-24.04-cuda-12.8
       env:
         CONFIG: RTX2060
         
@@ -74,7 +74,7 @@ jobs:
   build-3070:
     runs-on: ubuntu-latest
     container:
-      image: tgrogers/accel-sim_regress:Ubuntu-22.04-cuda-11.7
+      image: ghcr.io/accel-sim/accel-sim-framework:ubuntu-24.04-cuda-12.8
       env:
         CONFIG: RTX3070
         
diff --git a/.gitignore b/.gitignore
index 340277af8..c27f9218d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -56,9 +56,17 @@ debug_tools/WatchYourStep/ptxjitplus/gpgpu*
 debug_tools/WatchYourStep/ptxjitplus/*.old
 debug_tools/WatchYourStep/ptxjitplus/ptxjitplus
 debug_tools/WatchYourStep/ptxjitplus/*.ptx
+*.tmp
 
 # Accel-sim packages used for regressions
 accel-sim-framework/
 gpu-app-collection/
 
 setup
+
+# OS/IDE specific files
+.idea/
+.vscode/
+.DS_Store
+.DS_store
+__pycache__/
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 95ca8e085..2292e8b1b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -164,4 +164,8 @@ install(CODE "execute_process\(\
 install(CODE "execute_process\(\
     COMMAND ${CMAKE_COMMAND} -E create_symlink \
     ${GPGPUSIM_INSTALL_PATH}/$<TARGET_FILE_NAME:cudart> \
-    ${GPGPUSIM_INSTALL_PATH}/$<TARGET_FILE_NAME:cudart>.11.0\)")
\ No newline at end of file
+    ${GPGPUSIM_INSTALL_PATH}/$<TARGET_FILE_NAME:cudart>.11.0\)")
+install(CODE "execute_process\(\
+    COMMAND ${CMAKE_COMMAND} -E create_symlink \
+    ${GPGPUSIM_INSTALL_PATH}/$<TARGET_FILE_NAME:cudart> \
+    ${GPGPUSIM_INSTALL_PATH}/$<TARGET_FILE_NAME:cudart>.12\)")
diff --git a/Makefile b/Makefile
index 37dba0146..35457c98b 100644
--- a/Makefile
+++ b/Makefile
@@ -169,6 +169,7 @@ $(SIM_LIB_DIR)/libcudart.so: makedirs $(LIBS) cudalib
 	if [ ! -f $(SIM_LIB_DIR)/libcudart.so.10.0 ]; then ln -s libcudart.so $(SIM_LIB_DIR)/libcudart.so.10.0; fi
 	if [ ! -f $(SIM_LIB_DIR)/libcudart.so.10.1 ]; then ln -s libcudart.so $(SIM_LIB_DIR)/libcudart.so.10.1; fi
 	if [ ! -f $(SIM_LIB_DIR)/libcudart.so.11.0 ]; then ln -s libcudart.so $(SIM_LIB_DIR)/libcudart.so.11.0; fi
+	if [ ! -f $(SIM_LIB_DIR)/libcudart.so.12 ]; then ln -s libcudart.so $(SIM_LIB_DIR)/libcudart.so.12; fi
 	if [ ! -f $(SIM_LIB_DIR)/libcudart_mod.so ]; then ln -s libcudart.so $(SIM_LIB_DIR)/libcudart_mod.so; fi
 
 $(SIM_LIB_DIR)/libcudart.dylib: makedirs $(LIBS) cudalib
diff --git a/bitbucket-pipelines.yml b/bitbucket-pipelines.yml
deleted file mode 100644
index 6e331d424..000000000
--- a/bitbucket-pipelines.yml
+++ /dev/null
@@ -1,15 +0,0 @@
-# This is a sample build configuration for C++ – Make.
-# Check our guides at https://confluence.atlassian.com/x/5Q4SMw for more examples.
-# Only use spaces to indent your .yml configuration.
-# -----
-# You can specify a custom docker image from Docker Hub as your build environment.
-image: tgrogers/gpgpu-sim_regress:latest
-
-pipelines:
-  default:
-    - step:
-        script: # Modify the commands below to build your repository.
-          - docker run -v `pwd`:/home/runner/gpgpu-sim_distribution:rw tgrogers/gpgpu-sim_regress:latest /bin/bash -c "./start_torque.sh; chown -R runner /home/runner/gpgpu-sim_distribution; su - runner -c 'source /home/runner/gpgpu-sim_distribution/setup_environment && make -j -C /home/runner/gpgpu-sim_distribution && cd /home/runner/gpgpu-sim_simulations/ && git pull && /home/runner/gpgpu-sim_simulations/util/job_launching/run_simulations.py -c /home/runner/gpgpu-sim_simulations/util/job_launching/regression_recipies/rodinia_2.0-ft/configs.gtx1080ti.yml -N regress && /home/runner/gpgpu-sim_simulations/util/job_launching/monitor_func_test.py -v -N regress'"
-        services:
-          - docker
-        
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM2_GTX480/gpgpusim.config b/configs/tested-cfgs/SM2_GTX480/gpgpusim.config
index bc01821db..47637286e 100644
--- a/configs/tested-cfgs/SM2_GTX480/gpgpusim.config
+++ b/configs/tested-cfgs/SM2_GTX480/gpgpusim.config
@@ -56,8 +56,8 @@
 
 
 # In Fermi, the cache and shared memory can be configured to 16kb:48kb(default) or 48kb:16kb
-# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>
-# ** Optional parameter - Required when mshr_type==Texture Fifo
+# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>,<data_port_width>
+# ** Optional parameter - Required when mshr_type==Texture Fifo, set to 0 if not used
 # Note: Hashing set index function (H) only applies to a set size of 32 or 64. 
 -gpgpu_cache:dl1  N:32:128:4,L:L:m:N:H,S:64:8,8
 -gpgpu_shmem_size 49152
diff --git a/configs/tested-cfgs/SM3_KEPLER_TITAN/gpgpusim.config b/configs/tested-cfgs/SM3_KEPLER_TITAN/gpgpusim.config
index ef47ddfd9..9cb328f5d 100644
--- a/configs/tested-cfgs/SM3_KEPLER_TITAN/gpgpusim.config
+++ b/configs/tested-cfgs/SM3_KEPLER_TITAN/gpgpusim.config
@@ -99,8 +99,8 @@
 # Greedy then oldest scheduler
 -gpgpu_scheduler gto
 
-# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>
-# ** Optional parameter - Required when mshr_type==Texture Fifo
+# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>,<data_port_width>
+# ** Optional parameter - Required when mshr_type==Texture Fifo, set to 0 if not used
 # Note: Hashing set index function (H) only applies to a set size of 32 or 64. 
 # The defulat is to disable the L1 cache, unless cache modifieres are used
 -gpgpu_cache:dl1  S:4:128:32,L:L:s:N:L,A:256:8,16:0,32
diff --git a/configs/tested-cfgs/SM6_TITANX/gpgpusim.config b/configs/tested-cfgs/SM6_TITANX/gpgpusim.config
index 7d3e2d47e..882630e76 100644
--- a/configs/tested-cfgs/SM6_TITANX/gpgpusim.config
+++ b/configs/tested-cfgs/SM6_TITANX/gpgpusim.config
@@ -123,8 +123,8 @@
 -gpgpu_dual_issue_diff_exec_units 1
 
 ## L1/shared memory configuration
-# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>
-# ** Optional parameter - Required when mshr_type==Texture Fifo
+# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>,<data_port_width>
+# ** Optional parameter - Required when mshr_type==Texture Fifo, set to 0 if not used
 # Note: Hashing set index function (H) only applies to a set size of 32 or 64. 
 # The defulat is to disable the L1 cache, unless cache modifieres are used
 -gpgpu_l1_banks 2
diff --git a/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config b/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config
index 6ff4b6c08..8cc3ed6bf 100644
--- a/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config
+++ b/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config
@@ -83,8 +83,8 @@
 -gpgpu_dual_issue_diff_exec_units 1
 
 ## L1/shared memory configuration
-# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>
-# ** Optional parameter - Required when mshr_type==Texture Fifo
+# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>,<data_port_width>
+# ** Optional parameter - Required when mshr_type==Texture Fifo, set to 0 if not used
 # In adaptive cache, we adaptively assign the remaining shared memory to L1 cache 
 # For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x 
 -gpgpu_adaptive_cache_config 1
diff --git a/configs/tested-cfgs/SM75_RTX2060_S/gpgpusim.config b/configs/tested-cfgs/SM75_RTX2060_S/gpgpusim.config
index 08ac75277..290c08d6c 100644
--- a/configs/tested-cfgs/SM75_RTX2060_S/gpgpusim.config
+++ b/configs/tested-cfgs/SM75_RTX2060_S/gpgpusim.config
@@ -128,8 +128,8 @@
 -gpgpu_num_reg_banks 16
 -gpgpu_reg_file_port_throughput 2
 
-# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>
-# ** Optional parameter - Required when mshr_type==Texture Fifo
+# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>,<data_port_width>
+# ** Optional parameter - Required when mshr_type==Texture Fifo, set to 0 if not used
 -gpgpu_adaptive_cache_config 0
 -gpgpu_l1_banks 4
 -gpgpu_cache:dl1  S:1:128:512,L:L:s:N:L,A:256:8,16:0,32
diff --git a/configs/tested-cfgs/SM7_GV100/gpgpusim.config b/configs/tested-cfgs/SM7_GV100/gpgpusim.config
index 26ce0eb58..4887be8cf 100644
--- a/configs/tested-cfgs/SM7_GV100/gpgpusim.config
+++ b/configs/tested-cfgs/SM7_GV100/gpgpusim.config
@@ -137,8 +137,8 @@
 -gpgpu_dual_issue_diff_exec_units 1
 
 ## L1/shared memory configuration
-# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>
-# ** Optional parameter - Required when mshr_type==Texture Fifo
+# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>,<data_port_width>
+# ** Optional parameter - Required when mshr_type==Texture Fifo, set to 0 if not used
 # Defualt config is 32KB DL1 and 96KB shared memory
 # In Volta, we assign the remaining shared memory to L1 cache 
 # if the assigned shd mem = 0, then L1 cache = 128KB
diff --git a/configs/tested-cfgs/SM7_QV100/gpgpusim.config b/configs/tested-cfgs/SM7_QV100/gpgpusim.config
index b3384afcb..a14ae7567 100644
--- a/configs/tested-cfgs/SM7_QV100/gpgpusim.config
+++ b/configs/tested-cfgs/SM7_QV100/gpgpusim.config
@@ -137,8 +137,8 @@
 -gpgpu_dual_issue_diff_exec_units 1
 
 ## L1/shared memory configuration
-# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>
-# ** Optional parameter - Required when mshr_type==Texture Fifo
+# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>,<data_port_width>
+# ** Optional parameter - Required when mshr_type==Texture Fifo, set to 0 if not used
 # Defualt config is 32KB DL1 and 96KB shared memory
 # In Volta, we assign the remaining shared memory to L1 cache 
 # if the assigned shd mem = 0, then L1 cache = 128KB
diff --git a/configs/tested-cfgs/SM7_TITANV/gpgpusim.config b/configs/tested-cfgs/SM7_TITANV/gpgpusim.config
index c37aaf053..b48f37006 100644
--- a/configs/tested-cfgs/SM7_TITANV/gpgpusim.config
+++ b/configs/tested-cfgs/SM7_TITANV/gpgpusim.config
@@ -107,8 +107,8 @@
 -gpgpu_dual_issue_diff_exec_units 1
 
 ## L1/shared memory configuration
-# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>
-# ** Optional parameter - Required when mshr_type==Texture Fifo
+# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>,<data_port_width>
+# ** Optional parameter - Required when mshr_type==Texture Fifo, set to 0 if not used
 # Defualt config is 32KB DL1 and 96KB shared memory
 # In Volta, we assign the remaining shared memory to L1 cache 
 # if the assigned shd mem = 0, then L1 cache = 128KB
diff --git a/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config b/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config
index d26b1a621..c884541bb 100644
--- a/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config
+++ b/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config
@@ -83,8 +83,8 @@
 -gpgpu_dual_issue_diff_exec_units 1
 
 ## L1/shared memory configuration
-# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>
-# ** Optional parameter - Required when mshr_type==Texture Fifo
+# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>,<data_port_width>
+# ** Optional parameter - Required when mshr_type==Texture Fifo, set to 0 if not used
 # In adaptive cache, we adaptively assign the remaining shared memory to L1 cache 
 # For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x 
 -gpgpu_adaptive_cache_config 1
diff --git a/format-code.sh b/format-code.sh
index acd33ab1c..b83510d20 100755
--- a/format-code.sh
+++ b/format-code.sh
@@ -1,13 +1,13 @@
 # This bash script formats GPGPU-Sim using clang-format
 THIS_DIR="$( cd "$( dirname "$BASH_SOURCE" )" && pwd )"
 echo "Running clang-format on $THIS_DIR"
-clang-format -i ${THIS_DIR}/libcuda/*.h
-clang-format -i ${THIS_DIR}/libcuda/*.cc
-clang-format -i ${THIS_DIR}/src/*.h
-clang-format -i ${THIS_DIR}/src/*.cc
-clang-format -i ${THIS_DIR}/src/gpgpu-sim/*.h
-clang-format -i ${THIS_DIR}/src/gpgpu-sim/*.cc
-clang-format -i ${THIS_DIR}/src/cuda-sim/*.h
-clang-format -i ${THIS_DIR}/src/cuda-sim/*.cc
-clang-format -i ${THIS_DIR}/src/accelwattch/*.h
-clang-format -i ${THIS_DIR}/src/accelwattch/*.cc
\ No newline at end of file
+clang-format -i ${THIS_DIR}/libcuda/*.h --style=file:${THIS_DIR}/.clang-format
+clang-format -i ${THIS_DIR}/libcuda/*.cc --style=file:${THIS_DIR}/.clang-format
+clang-format -i ${THIS_DIR}/src/*.h --style=file:${THIS_DIR}/.clang-format
+clang-format -i ${THIS_DIR}/src/*.cc --style=file:${THIS_DIR}/.clang-format
+clang-format -i ${THIS_DIR}/src/gpgpu-sim/*.h --style=file:${THIS_DIR}/.clang-format
+clang-format -i ${THIS_DIR}/src/gpgpu-sim/*.cc --style=file:${THIS_DIR}/.clang-format
+clang-format -i ${THIS_DIR}/src/cuda-sim/*.h --style=file:${THIS_DIR}/.clang-format
+clang-format -i ${THIS_DIR}/src/cuda-sim/*.cc --style=file:${THIS_DIR}/.clang-format
+clang-format -i ${THIS_DIR}/src/accelwattch/*.h --style=file:${THIS_DIR}/.clang-format
+clang-format -i ${THIS_DIR}/src/accelwattch/*.cc --style=file:${THIS_DIR}/.clang-format
diff --git a/gpgpusim_check.cmake b/gpgpusim_check.cmake
index 486d66dc9..7adf34286 100644
--- a/gpgpusim_check.cmake
+++ b/gpgpusim_check.cmake
@@ -63,8 +63,8 @@ else()
     message(CHECK_PASS "${CUDAToolkit_NVCC_EXECUTABLE}")
     message(CHECK_START "Checking CUDA compiler version")
     message(CHECK_PASS "${CUDAToolkit_VERSION}")
-    if((CUDAToolkit_VERSION VERSION_LESS 2.0.3) OR (CUDAToolkit_VERSION VERSION_GREATER 11.10.0))
-        message(FATAL_ERROR "GPGPU-Sim ${CMAKE_PROJECT_VERSION} not tested with CUDA version ${CUDAToolkit_VERSION} (please see README)")
+    if((CUDAToolkit_VERSION VERSION_LESS 2.0.3) OR (CUDAToolkit_VERSION VERSION_GREATER 13.0.0))
+        message(WARNING "GPGPU-Sim not tested with CUDA version ${CUDAToolkit_VERSION} (please see README)")
     endif()
 endif()
 
@@ -132,4 +132,4 @@ list(POP_BACK CMAKE_MESSAGE_INDENT)
 message(CHECK_PASS "done")
 message(STATUS "Be sure to run 'source setup' "
                "before you run CUDA program with GPGPU-Sim or building with external "
-               "simulator like SST")
\ No newline at end of file
+               "simulator like SST")
diff --git a/libcuda/cuda_api.h b/libcuda/cuda_api.h
index 5a970ba01..52f36ebdb 100644
--- a/libcuda/cuda_api.h
+++ b/libcuda/cuda_api.h
@@ -2607,12 +2607,12 @@ typedef struct CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st {
 /**
  * Device that represents the CPU
  */
-#define CU_DEVICE_CPU ((CUdevice)-1)
+#define CU_DEVICE_CPU ((CUdevice) - 1)
 
 /**
  * Device that represents an invalid device
  */
-#define CU_DEVICE_INVALID ((CUdevice)-2)
+#define CU_DEVICE_INVALID ((CUdevice) - 2)
 
 /** @} */ /* END CUDA_TYPES */
 
diff --git a/libcuda/cuda_api_object.h b/libcuda/cuda_api_object.h
index e620e5728..3e34641ed 100644
--- a/libcuda/cuda_api_object.h
+++ b/libcuda/cuda_api_object.h
@@ -35,9 +35,7 @@ struct _cuda_device_id {
     m_next = NULL;
     m_gpgpu = gpu;
   }
-  struct _cuda_device_id *next() {
-    return m_next;
-  }
+  struct _cuda_device_id *next() { return m_next; }
   unsigned num_shader() const { return m_gpgpu->get_config().num_shader(); }
   int num_devices() const {
     if (m_next == NULL)
@@ -158,9 +156,7 @@ class kernel_config {
   void set_grid_dim(dim3 *d) { m_GridDim = *d; }
   void set_block_dim(dim3 *d) { m_BlockDim = *d; }
   gpgpu_ptx_sim_arg_list_t get_args() { return m_args; }
-  struct CUstream_st *get_stream() {
-    return m_stream;
-  }
+  struct CUstream_st *get_stream() { return m_stream; }
 
  private:
   dim3 m_GridDim;
diff --git a/libcuda/cuda_runtime_api.cc b/libcuda/cuda_runtime_api.cc
index 3574fc17d..8de12a335 100644
--- a/libcuda/cuda_runtime_api.cc
+++ b/libcuda/cuda_runtime_api.cc
@@ -1809,6 +1809,8 @@ cudaDeviceGetAttributeInternal(int *value, enum cudaDeviceAttr attr, int device,
       case 19:
         *value = 0;
         break;
+      case 20:  // cudaDevAttrComputeMode for controlling cudaSetDevice for threads
+        *value = 0; // Dummy value, should not affect simulation
       case 21:
       case 22:
       case 23:
@@ -2120,8 +2122,10 @@ __host__ cudaError_t CUDARTAPI cudaStreamSynchronizeInternal(
     announce_call(__my_func__);
   }
 #if (CUDART_VERSION >= 3000)
-  if (stream == NULL) ctx->synchronize();
-  return g_last_cudaError = cudaSuccess;
+  if (stream == NULL) {
+    ctx->synchronize();
+    return g_last_cudaError = cudaSuccess;
+  }
   stream->synchronize();
 #else
   printf(
@@ -2427,6 +2431,18 @@ void SST_gpgpusim_numcores_equal_check(unsigned sst_numcores) {
       ->SST_gpgpusim_numcores_equal_check(sst_numcores);
 }
 
+/**
+ * @brief For SST to check if kernel launch is blocking
+ *        Future: we will need a better interface to the 
+ *        GPGPU-Sim config for integration with outside
+ *        simulators.
+ * 
+ */
+bool SST_gpgpusim_launch_blocking() {
+  return GPGPU_Context()->the_gpgpusim->g_stream_manager->is_blocking();
+
+}
+
 uint64_t cudaMallocSST(void **devPtr, size_t size) {
   if (g_debug_execution >= 3) {
     announce_call(__my_func__);
@@ -2977,6 +2993,40 @@ __host__ cudaError_t CUDARTAPI cudaStreamSynchronize(cudaStream_t stream) {
   return cudaStreamSynchronizeInternal(stream);
 }
 
+__host__ cudaError_t CUDARTAPI cudaStreamSynchronizeSST(cudaStream_t stream) {
+  // For SST, perform a one-time check
+  gpgpu_context *ctx = GPGPU_Context();
+  if (g_debug_execution >= 3) {
+    announce_call(__my_func__);
+  }
+
+  // default stream: all is done
+  // other streams: no more ops
+  g_last_cudaError = cudaSuccess;
+  if (stream == NULL) {
+    // For default stream, sync is equivalent to cudaThreadSync
+    bool thread_synced = ctx->synchronize_check();
+    if (thread_synced) {
+      // We are already done, so no need to poll for sync done
+      return cudaSuccess;
+    } else {
+      // Otherwise we mark we should wait for default strem to sync
+      ctx->the_gpgpusim->g_stream_manager->get_stream_zero()->set_request_synchronize();
+      return cudaErrorNotReady;
+    }
+  } else {
+    // For other stream, check if it is already sync'ed
+    bool stream_synced = stream->synchronize_check();
+    if (stream_synced) {
+      return cudaSuccess;
+    } else {
+      stream->set_request_synchronize();
+      return cudaErrorNotReady;
+    }
+  }
+  return g_last_cudaError = cudaSuccess;
+}
+
 __host__ cudaError_t CUDARTAPI cudaStreamQuery(cudaStream_t stream) {
   if (g_debug_execution >= 3) {
     announce_call(__my_func__);
@@ -3046,13 +3096,34 @@ __host__ cudaError_t CUDARTAPI cudaEventSynchronize(cudaEvent_t event) {
   printf("GPGPU-Sim API: cudaEventSynchronize ** waiting for event\n");
   fflush(stdout);
   CUevent_st *e = (CUevent_st *)event;
-  while (!e->done())
-    ;
+  while (!e->done());
   printf("GPGPU-Sim API: cudaEventSynchronize ** event detected\n");
   fflush(stdout);
   return g_last_cudaError = cudaSuccess;
 }
 
+__host__ cudaError_t CUDARTAPI cudaEventSynchronizeSST(cudaEvent_t event) {
+  // For SST, perform a one-time check
+  // and let stream manager send the callback once the event is done
+  if (g_debug_execution >= 3) {
+    announce_call(__my_func__);
+  }
+  printf("GPGPU-Sim API: cudaEventSynchronize ** waiting for event\n");
+  fflush(stdout);
+  CUevent_st *e = (CUevent_st *)event;
+  bool event_sync_done = e->done();
+  if (event_sync_done) {
+    printf("GPGPU-Sim API: cudaEventSynchronize ** event detected\n");
+    fflush(stdout);
+    return cudaSuccess;
+  } else {
+    printf("GPGPU-Sim API: cudaEventSynchronize ** still waiting for event\n");
+    // Mark this event as waiting for synchronization
+    e->set_request_synchronize();
+    return cudaErrorNotReady;
+  }
+}
+
 __host__ cudaError_t CUDARTAPI cudaEventDestroy(cudaEvent_t event) {
   if (g_debug_execution >= 3) {
     announce_call(__my_func__);
@@ -3112,6 +3183,7 @@ __host__ cudaError_t CUDARTAPI cudaThreadSynchronizeSST(void) {
     ctx->requested_synchronize = false;
     return cudaSuccess;
   } else {
+    ctx->requested_synchronize = true;
     return cudaErrorNotReady;
   }
 }
@@ -3171,7 +3243,7 @@ __host__ cudaError_t CUDARTAPI cudaGetExportTable(
  *                                                                              *
  *******************************************************************************/
 
-//#include "../../cuobjdump_to_ptxplus/cuobjdump_parser.h"
+// #include "../../cuobjdump_to_ptxplus/cuobjdump_parser.h"
 
 // extracts all ptx files from binary and dumps into
 // prog_name.unique_no.sm_<>.ptx files
@@ -4021,6 +4093,18 @@ cudaError_t CUDARTAPI cudaSetDeviceFlags(int flags) {
   }
 }
 
+cudaError_t CUDARTAPI cudaSetDeviceFlagsSST(int flags) {
+  if (g_debug_execution >= 3) {
+    announce_call(__my_func__);
+  }
+  // SST's simple stream example relies on this
+  // currently just set it to no-op
+  printf(
+    "GPGPU-Sim PTX: Execution warning: ignoring call to \"%s ( flag=%p)\"\n",
+    __my_func__, flags);
+  return cudaSuccess;
+}
+
 cudaError_t CUDARTAPI cudaFuncGetAttributes(struct cudaFuncAttributes *attr,
                                             const char *hostFun) {
   return cudaFuncGetAttributesInternal(attr, hostFun);
@@ -4068,9 +4152,9 @@ __host__ cudaError_t CUDARTAPI cudaDeviceSetLimit(enum cudaLimit limit,
   return g_last_cudaError = cudaSuccess;
 }
 
-//#if CUDART_VERSION >= 9000
+// #if CUDART_VERSION >= 9000
 //__host__  cudaError_t cudaFuncSetAttribute ( const void* func, enum
-// cudaFuncAttribute attr, int value ) {
+//  cudaFuncAttribute attr, int value ) {
 
 // ignore this Attribute for now, and the default is that carveout =
 // cudaSharedmemCarveoutDefault;   //  (-1)
diff --git a/linux-so-version.txt b/linux-so-version.txt
index 3db07aca6..55d372748 100644
--- a/linux-so-version.txt
+++ b/linux-so-version.txt
@@ -10,5 +10,7 @@ libcudart.so.10.1{
 };
 libcudart.so.11.0{
 };
+libcudart.so.12{
+};
 libcuda.so.1{
 };
diff --git a/new b/new
deleted file mode 100644
index 064543fd2..000000000
--- a/new
+++ /dev/null
@@ -1,7 +0,0 @@
-sudo docker run --dns 206.87.227.99 --privileged -v `pwd`:/home/runner/gpgpu-sim_distribution:rw tgrogers/gpgpu-sim_regress:volta_update /bin/bash -c "./start_torque.sh; chown -R runner /home/runner/gpgpu-sim_distribution; su - runner -c 'export CUDA_INSTALL_PATH=/usr/local/cuda-4.2/ && export PTXAS_CUDA_INSTALL_PATH=/usr/local/cuda-4.2/ && make clean -C /home/runner/gpgpu-sim_distribution && source /home/runner/gpgpu-sim_distribution/setup_environment && make -j -C /home/runner/gpgpu-sim_distribution && cd /home/runner/gpgpu-sim_simulations/ && git pull && /home/runner/gpgpu-sim_simulations/util/job_launching/run_simulations.py -C GTX480 -B rodinia_2.0-ft -N regress && /home/runner/gpgpu-sim_simulations/util/job_launching/monitor_func_test.py -v -N regress'"
-#    - services: docker
-#      env:
-#        - CONFIG=GTX480
-#        - CUDA_INSTALL_PATH=/usr/local/cuda-4.2/
-#        - PTXAS_CUDA_INSTALL_PATH=/usr/local/cuda-4.2/
-
diff --git a/src/abstract_hardware_model.cc b/src/abstract_hardware_model.cc
index e8ddf95ab..8743cc7a7 100644
--- a/src/abstract_hardware_model.cc
+++ b/src/abstract_hardware_model.cc
@@ -216,20 +216,20 @@ new_addr_type line_size_based_tag_func(new_addr_type address,
   return address & ~(line_size - 1);
 }
 
-const char *mem_access_type_str(enum mem_access_type access_type) {
+const char *mem_access_type_str(enum mem_access_type access_type){
 #define MA_TUP_BEGIN(X) static const char *access_type_str[] = {
 #define MA_TUP(X) #X
 #define MA_TUP_END(X) \
   }                   \
   ;
-  MEM_ACCESS_TYPE_TUP_DEF
+    MEM_ACCESS_TYPE_TUP_DEF
 #undef MA_TUP_BEGIN
 #undef MA_TUP
 #undef MA_TUP_END
 
-  assert(access_type < NUM_MEM_ACCESS_TYPE);
+        assert(access_type < NUM_MEM_ACCESS_TYPE);
 
-  return access_type_str[access_type];
+return access_type_str[access_type];
 }
 
 void warp_inst_t::clear_active(const active_mask_t &inactive) {
diff --git a/src/abstract_hardware_model.h b/src/abstract_hardware_model.h
index 98a403997..cddf523e7 100644
--- a/src/abstract_hardware_model.h
+++ b/src/abstract_hardware_model.h
@@ -248,9 +248,7 @@ class kernel_info_t {
   }
   bool running() const { return m_num_cores_running > 0; }
   bool done() const { return no_more_ctas_to_run() && !running(); }
-  class function_info *entry() {
-    return m_kernel_entry;
-  }
+  class function_info *entry() { return m_kernel_entry; }
   const class function_info *entry() const { return m_kernel_entry; }
 
   size_t num_blocks() const {
@@ -300,9 +298,7 @@ class kernel_info_t {
   std::list<class ptx_thread_info *> &active_threads() {
     return m_active_threads;
   }
-  class memory_space *get_param_memory() {
-    return m_param_mem;
-  }
+  class memory_space *get_param_memory() { return m_param_mem; }
 
   // The following functions access texture bindings present at the kernel's
   // launch
@@ -609,15 +605,9 @@ class gpgpu_t {
   void memcpy_from_gpu(void *dst, size_t src_start_addr, size_t count);
   void memcpy_gpu_to_gpu(size_t dst, size_t src, size_t count);
 
-  class memory_space *get_global_memory() {
-    return m_global_mem;
-  }
-  class memory_space *get_tex_memory() {
-    return m_tex_mem;
-  }
-  class memory_space *get_surf_memory() {
-    return m_surf_mem;
-  }
+  class memory_space *get_global_memory() { return m_global_mem; }
+  class memory_space *get_tex_memory() { return m_tex_mem; }
+  class memory_space *get_surf_memory() { return m_surf_mem; }
 
   void gpgpu_ptx_sim_bindTextureToArray(const struct textureReference *texref,
                                         const struct cudaArray *array);
@@ -701,6 +691,7 @@ struct gpgpu_ptx_sim_info {
   int cmem;
   int gmem;
   int regs;
+  int barriers;
   unsigned maxthreads;
   unsigned ptx_version;
   unsigned sm_target;
@@ -1336,9 +1327,7 @@ class core_t {
   virtual bool warp_waiting_at_barrier(unsigned warp_id) const = 0;
   virtual void checkExecutionStatusAndUpdate(warp_inst_t &inst, unsigned t,
                                              unsigned tid) = 0;
-  class gpgpu_sim *get_gpu() {
-    return m_gpu;
-  }
+  class gpgpu_sim *get_gpu() { return m_gpu; }
   void execute_warp_inst_t(warp_inst_t &inst, unsigned warpId = (unsigned)-1);
   bool ptx_thread_done(unsigned hw_thread_id) const;
   virtual void updateSIMTStack(unsigned warpId, warp_inst_t *inst);
@@ -1348,9 +1337,7 @@ class core_t {
   void get_pdom_stack_top_info(unsigned warpId, unsigned *pc,
                                unsigned *rpc) const;
   kernel_info_t *get_kernel_info() { return m_kernel; }
-  class ptx_thread_info **get_thread_info() {
-    return m_thread;
-  }
+  class ptx_thread_info **get_thread_info() { return m_thread; }
   unsigned get_warp_size() const { return m_warp_size; }
   void and_reduction(unsigned ctaid, unsigned barid, bool value) {
     reduction_storage[ctaid][barid] &= value;
diff --git a/src/accelwattch/XML_Parse.h b/src/accelwattch/XML_Parse.h
index 176b82f6e..7017c9002 100644
--- a/src/accelwattch/XML_Parse.h
+++ b/src/accelwattch/XML_Parse.h
@@ -40,9 +40,9 @@
 #ifndef XML_PARSE_H_
 #define XML_PARSE_H_
 
-//#ifdef WIN32
-//#define _CRT_SECURE_NO_DEPRECATE
-//#endif
+// #ifdef WIN32
+// #define _CRT_SECURE_NO_DEPRECATE
+// #endif
 
 #include <stdio.h>
 #include <string.h>
diff --git a/src/accelwattch/core.cc b/src/accelwattch/core.cc
index cbaefc7ff..fe11e058e 100644
--- a/src/accelwattch/core.cc
+++ b/src/accelwattch/core.cc
@@ -47,38 +47,39 @@
 #include "const.h"
 #include "io.h"
 #include "parameter.h"
-//#include "globalvar.h"
-// double exClockRate;
+// #include "globalvar.h"
+//  double exClockRate;
 //*********************
-// Operand collector (OC) modelling (Syed Gilani)
+//  Operand collector (OC) modelling (Syed Gilani)
 //*********************
-// The OCs are modelled similar to the GPGPU-Sim v3.x documentation and
-// nVIDIA patents.
-// the OC need the following GPGPU-Sim config options:
+//  The OCs are modelled similar to the GPGPU-Sim v3.x documentation and
+//  nVIDIA patents.
+//  the OC need the following GPGPU-Sim config options:
 //-gpgpu_num_reg_banks                    8 # Number of register banks (default
 //= 8) -gpgpu_reg_bank_use_warp_id                    0 # Use warp ID in mapping
-// registers to banks (default = off) -gpgpu_operand_collector_num_units_sp 6 #
-// number of collector units (default = 4)
-// -gpgpu_operand_collector_num_units_sfu 8 # number of collector units (default
-// = 4) -gpgpu_operand_collector_num_units_mem                    2 # number of
-// collector units (default = 2) -gpgpu_operand_collector_num_units_gen 0 #
-// number of collector units (default = 0)
+//  registers to banks (default = off) -gpgpu_operand_collector_num_units_sp 6 #
+//  number of collector units (default = 4)
+//  -gpgpu_operand_collector_num_units_sfu 8 # number of collector units
+//  (default = 4) -gpgpu_operand_collector_num_units_mem                    2 #
+//  number of collector units (default = 2)
+//  -gpgpu_operand_collector_num_units_gen 0 # number of collector units
+//  (default = 0)
 //-gpgpu_operand_collector_num_in_ports_sp                    1 # number of
-// collector unit in ports (default = 1)
+//  collector unit in ports (default = 1)
 //-gpgpu_operand_collector_num_in_ports_sfu                    1 # number of
-// collector unit in ports (default = 1)
+//  collector unit in ports (default = 1)
 //-gpgpu_operand_collector_num_in_ports_mem                    1 # number of
-// collector unit in ports (default = 1)
+//  collector unit in ports (default = 1)
 //-gpgpu_operand_collector_num_in_ports_gen                    0 # number of
-// collector unit in ports (default = 0)
+//  collector unit in ports (default = 0)
 //-gpgpu_operand_collector_num_out_ports_sp                    1 # number of
-// collector unit in ports (default = 1)
+//  collector unit in ports (default = 1)
 //-gpgpu_operand_collector_num_out_ports_sfu                    1 # number of
-// collector unit in ports (default = 1)
+//  collector unit in ports (default = 1)
 //-gpgpu_operand_collector_num_out_ports_mem                    1 # number of
-// collector unit in ports (default = 1)
+//  collector unit in ports (default = 1)
 //-gpgpu_operand_collector_num_out_ports_gen                    0 # number of
-// collector unit in ports (default = 0)
+//  collector unit in ports (default = 0)
 
 // The total number of collector units and their input ports, and the number of
 // register file banks determine the crossbar size.
@@ -1837,7 +1838,7 @@ MemManU::MemManU(ParseXML* XML_interface, int ithCore_,
   area.set_area(area.get_area() + dtlb->local_result.area);
   // output_data_csv(dtlb.tlb.local_result);
 }
-//#define FERMI
+// #define FERMI
 
 RegFU::RegFU(ParseXML* XML_interface, int ithCore_,
              InputParameter* interface_ip_, const CoreDynParam& dyn_p_,
@@ -2194,7 +2195,7 @@ EXECU::EXECU(ParseXML* XML_interface, int ithCore_,
             bypass.area.set_area(bypass.area.get_area()
     +fpTagBypass->area.get_area());
     }*/
-  }       /* if (coredynp.core_ty==Inorder) */
+  } /* if (coredynp.core_ty==Inorder) */
   else {  // OOO
     if (coredynp.scheu_ty == PhysicalRegFile) {
       /* For physical register based OOO,
diff --git a/src/accelwattch/iocontrollers.h b/src/accelwattch/iocontrollers.h
index 9d6c48a22..22df1fb0b 100644
--- a/src/accelwattch/iocontrollers.h
+++ b/src/accelwattch/iocontrollers.h
@@ -35,9 +35,9 @@
 
 #include "XML_Parse.h"
 #include "cacti/parameter.h"
-//#include "io.h"
+// #include "io.h"
 #include "array.h"
-//#include "Undifferentiated_Core_Area.h"
+// #include "Undifferentiated_Core_Area.h"
 #include <vector>
 #include "basic_components.h"
 
diff --git a/src/accelwattch/logic.cc b/src/accelwattch/logic.cc
index 7f401895c..3d3a1a463 100644
--- a/src/accelwattch/logic.cc
+++ b/src/accelwattch/logic.cc
@@ -656,7 +656,7 @@ FunctionalUnit::FunctionalUnit(ParseXML *XML_interface, int ithCore_,
     }
     per_access_energy *= 0.5;  // According to ARM data embedded processor has
                                // much lower per acc energy
-  }                            /* if (XML->sys.Embedded) */
+  } /* if (XML->sys.Embedded) */
   else {
     if (fu_type == FPU) {
       num_fu = coredynp.num_fpus;
diff --git a/src/accelwattch/memoryctrl.h b/src/accelwattch/memoryctrl.h
index 4ac55fc73..f065c0aeb 100644
--- a/src/accelwattch/memoryctrl.h
+++ b/src/accelwattch/memoryctrl.h
@@ -41,9 +41,9 @@
 
 #include "XML_Parse.h"
 #include "cacti/parameter.h"
-//#include "io.h"
+// #include "io.h"
 #include "array.h"
-//#include "Undifferentiated_Core_Area.h"
+// #include "Undifferentiated_Core_Area.h"
 #include <vector>
 #include "basic_components.h"
 
diff --git a/src/accelwattch/processor.cc b/src/accelwattch/processor.cc
index d5c7cdda8..a86b96dc9 100644
--- a/src/accelwattch/processor.cc
+++ b/src/accelwattch/processor.cc
@@ -664,24 +664,22 @@ void Processor::displayDeviceType(int device_type_, uint32_t indent) {
 
   switch (device_type_) {
     case 0:
-      cout << indent_str << "Device Type= "
-           << "ITRS high performance device type" << endl;
+      cout << indent_str
+           << "Device Type= " << "ITRS high performance device type" << endl;
       break;
     case 1:
-      cout << indent_str << "Device Type= "
-           << "ITRS low standby power device type" << endl;
+      cout << indent_str
+           << "Device Type= " << "ITRS low standby power device type" << endl;
       break;
     case 2:
-      cout << indent_str << "Device Type= "
-           << "ITRS low operating power device type" << endl;
+      cout << indent_str
+           << "Device Type= " << "ITRS low operating power device type" << endl;
       break;
     case 3:
-      cout << indent_str << "Device Type= "
-           << "LP-DRAM device type" << endl;
+      cout << indent_str << "Device Type= " << "LP-DRAM device type" << endl;
       break;
     case 4:
-      cout << indent_str << "Device Type= "
-           << "COMM-DRAM device type" << endl;
+      cout << indent_str << "Device Type= " << "COMM-DRAM device type" << endl;
       break;
     default: {
       cout << indent_str << "Unknown Device Type" << endl;
diff --git a/src/accelwattch/xmlParser.cc b/src/accelwattch/xmlParser.cc
index 780d2ad04..9f01ebe7e 100644
--- a/src/accelwattch/xmlParser.cc
+++ b/src/accelwattch/xmlParser.cc
@@ -85,10 +85,10 @@
 #endif
 #include "xmlParser.h"
 #ifdef _XMLWINDOWS
-//#ifdef _DEBUG
-//#define _CRTDBG_MAP_ALLOC
-//#include <crtdbg.h>
-//#endif
+// #ifdef _DEBUG
+// #define _CRTDBG_MAP_ALLOC
+// #include <crtdbg.h>
+// #endif
 #define WIN32_LEAN_AND_MEAN
 #include <Windows.h>  // to have IsTextUnicode, MultiByteToWideChar, WideCharToMultiByte to handle unicode files
 // to have "MessageBoxA" to display error messages for openFilHelper
@@ -3241,7 +3241,7 @@ XMLSTR XMLParserBase64Tool::encode(unsigned char *inbuf, unsigned int inlen,
     *(curr++) = base64EncodeTable[j >> 18];
     *(curr++) = base64EncodeTable[(j >> 12) & 0x3f];
     *(curr++) = base64EncodeTable[(j >> 6) & 0x3f];
-    *(curr++) = base64EncodeTable[(j)&0x3f];
+    *(curr++) = base64EncodeTable[(j) & 0x3f];
     if (formatted) {
       if (!k) {
         *(curr++) = _CXML('\n');
diff --git a/src/accelwattch/xmlParser.h b/src/accelwattch/xmlParser.h
index 71a1f5722..b5c077972 100644
--- a/src/accelwattch/xmlParser.h
+++ b/src/accelwattch/xmlParser.h
@@ -163,7 +163,7 @@
 
 // uncomment the next line if you want no support for wchar_t* (no need for the
 // <wchar.h> or <tchar.h> libraries anymore to compile)
-//#define XML_NO_WIDE_CHAR
+// #define XML_NO_WIDE_CHAR
 
 #ifdef XML_NO_WIDE_CHAR
 #undef _XMLWINDOWS
diff --git a/src/cuda-sim/cuda-sim.cc b/src/cuda-sim/cuda-sim.cc
index d05549cc3..69d1eb74f 100644
--- a/src/cuda-sim/cuda-sim.cc
+++ b/src/cuda-sim/cuda-sim.cc
@@ -2782,9 +2782,7 @@ void print_ptxinfo() {
   }
 }
 
-struct gpgpu_ptx_sim_info get_ptxinfo() {
-  return g_ptxinfo;
-}
+struct gpgpu_ptx_sim_info get_ptxinfo() { return g_ptxinfo; }
 
 std::map<unsigned, const char *> get_duplicate() { return g_duplicate; }
 
@@ -2801,6 +2799,8 @@ void ptxinfo_function(const char *fname) {
 
 void ptxinfo_regs(unsigned nregs) { g_ptxinfo.regs = nregs; }
 
+void ptxinfo_barriers(unsigned barriers) { g_ptxinfo.barriers = barriers; }
+
 void ptxinfo_lmem(unsigned declared, unsigned system) {
   g_ptxinfo.lmem = declared + system;
 }
diff --git a/src/cuda-sim/cuda-sim.h b/src/cuda-sim/cuda-sim.h
index 21e1ca058..b1caf0c64 100644
--- a/src/cuda-sim/cuda-sim.h
+++ b/src/cuda-sim/cuda-sim.h
@@ -101,8 +101,8 @@ class functionalCoreSim : public core_t {
   bool *m_warpAtBarrier;
 };
 
-#define RECONVERGE_RETURN_PC ((address_type)-2)
-#define NO_BRANCH_DIVERGENCE ((address_type)-1)
+#define RECONVERGE_RETURN_PC ((address_type) - 2)
+#define NO_BRANCH_DIVERGENCE ((address_type) - 1)
 address_type get_return_pc(void *thd);
 const char *get_ptxinfo_kname();
 void print_ptxinfo();
diff --git a/src/cuda-sim/half.h b/src/cuda-sim/half.h
index fab1a229b..67e607b5d 100644
--- a/src/cuda-sim/half.h
+++ b/src/cuda-sim/half.h
@@ -846,10 +846,8 @@ uint16 int2half_impl(T value) {
       bits |= 0x7BFF + (R != std::round_toward_zero);
   } else if (value) {
     unsigned int m = value, exp = 24;
-    for (; m < 0x400; m <<= 1, --exp)
-      ;
-    for (; m > 0x7FF; m >>= 1, ++exp)
-      ;
+    for (; m < 0x400; m <<= 1, --exp);
+    for (; m > 0x7FF; m >>= 1, ++exp);
     bits |= (exp << 10) + m;
     if (exp > 24) {
       if (R == std::round_to_nearest)
@@ -1274,8 +1272,7 @@ inline double half2float_impl(uint16 value, double, true_type) {
   int abs = value & 0x7FFF;
   if (abs) {
     hi |= 0x3F000000 << static_cast<unsigned>(abs >= 0x7C00);
-    for (; abs < 0x400; abs <<= 1, hi -= 0x100000)
-      ;
+    for (; abs < 0x400; abs <<= 1, hi -= 0x100000);
     hi += static_cast<uint32>(abs) << 10;
   }
   uint64 bits = static_cast<uint64>(hi) << 32;
@@ -2116,8 +2113,7 @@ struct functions {
   static half frexp(half arg, int *exp) {
     int m = arg.data_ & 0x7FFF, e = -14;
     if (m >= 0x7C00 || !m) return *exp = 0, arg;
-    for (; m < 0x400; m <<= 1, --e)
-      ;
+    for (; m < 0x400; m <<= 1, --e);
     return *exp = e + (m >> 10),
            half(binary, (arg.data_ & 0x8000) | 0x3800 | (m & 0x3FF));
   }
@@ -2135,8 +2131,7 @@ struct functions {
     unsigned int mask = (1 << (25 - e)) - 1, m = arg.data_ & mask;
     iptr->data_ = arg.data_ & ~mask;
     if (!m) return half(binary, arg.data_ & 0x8000);
-    for (; m < 0x400; m <<= 1, --e)
-      ;
+    for (; m < 0x400; m <<= 1, --e);
     return half(binary, static_cast<uint16>((arg.data_ & 0x8000) | (e << 10) |
                                             (m & 0x3FF)));
   }
@@ -2148,8 +2143,7 @@ struct functions {
   static half scalbln(half arg, long exp) {
     unsigned int m = arg.data_ & 0x7FFF;
     if (m >= 0x7C00 || !m) return arg;
-    for (; m < 0x400; m <<= 1, --exp)
-      ;
+    for (; m < 0x400; m <<= 1, --exp);
     exp += m >> 10;
     uint16 value = arg.data_ & 0x8000;
     if (exp > 30) {
@@ -2191,8 +2185,7 @@ struct functions {
     if (abs < 0x7C00) {
       int exp = (abs >> 10) - 15;
       if (abs < 0x400)
-        for (; abs < 0x200; abs <<= 1, --exp)
-          ;
+        for (; abs < 0x200; abs <<= 1, --exp);
       return exp;
     }
     if (abs > 0x7C00) return FP_ILOGBNAN;
@@ -2208,13 +2201,11 @@ struct functions {
     if (abs < 0x7C00) {
       int exp = (abs >> 10) - 15;
       if (abs < 0x400)
-        for (; abs < 0x200; abs <<= 1, --exp)
-          ;
+        for (; abs < 0x200; abs <<= 1, --exp);
       uint16 bits = (exp < 0) << 15;
       if (exp) {
         unsigned int m = std::abs(exp) << 6, e = 18;
-        for (; m < 0x400; m <<= 1, --e)
-          ;
+        for (; m < 0x400; m <<= 1, --e);
         bits |= (e << 10) + m;
       }
       return half(binary, bits);
diff --git a/src/cuda-sim/ptx_ir.h b/src/cuda-sim/ptx_ir.h
index b08a692d8..4a7d39b66 100644
--- a/src/cuda-sim/ptx_ir.h
+++ b/src/cuda-sim/ptx_ir.h
@@ -39,7 +39,7 @@
 #include <string>
 #include <vector>
 
-//#include "ptx.tab.h"
+// #include "ptx.tab.h"
 #include "ptx_sim.h"
 
 #include "memory.h"
diff --git a/src/cuda-sim/ptx_sim.h b/src/cuda-sim/ptx_sim.h
index 8eec922e4..7128f8e4e 100644
--- a/src/cuda-sim/ptx_sim.h
+++ b/src/cuda-sim/ptx_sim.h
@@ -340,9 +340,7 @@ class ptx_thread_info {
   dim3 get_ctaid() const { return m_ctaid; }
   dim3 get_tid() const { return m_tid; }
   dim3 get_ntid() const { return m_ntid; }
-  class gpgpu_sim *get_gpu() {
-    return (gpgpu_sim *)m_gpu;
-  }
+  class gpgpu_sim *get_gpu() { return (gpgpu_sim *)m_gpu; }
   unsigned get_hw_tid() const { return m_hw_tid; }
   unsigned get_hw_ctaid() const { return m_hw_ctaid; }
   unsigned get_hw_wid() const { return m_hw_wid; }
diff --git a/src/cuda-sim/ptxinfo.l b/src/cuda-sim/ptxinfo.l
index 51371e321..db81bca64 100644
--- a/src/cuda-sim/ptxinfo.l
+++ b/src/cuda-sim/ptxinfo.l
@@ -58,6 +58,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 "Compiling entry function"	TC; return FUNC;
 "Used"		TC; return USED;
 "registers"	TC; return REGS;
+"used"		TC; return USED;
+"barriers"	TC; return REGS;
 "bytes"		TC; return BYTES;
 "lmem"		TC; return LMEM;
 "smem"		TC; return SMEM;
@@ -65,11 +67,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 "gmem"		TC; return GMEM;
 "line"		TC; return LINE;
 "for"		TC; return FOR;
+"ms"		TC; return MS;
 "textures"	TC; return TEXTURES;
 "error   : Duplicate definition of"	TC; return DUPLICATE;
 "function"	TC; yylval->string_value = strdup(yytext); return FUNCTION;
 "variable"	TC; yylval->string_value = strdup(yytext); return VARIABLE;
 "fatal   : Ptx assembly aborted due to errors"	TC; return FATAL;
+"Compile time = "	TC; return COMPILETIME;
 
 [_A-Za-z$%][_0-9A-Za-z$]*  TC; yylval->string_value = strdup(yytext); return IDENTIFIER;
 [-]{0,1}[0-9]+	 TC; yylval->int_value =  atoi(yytext); return INT_OPERAND;
@@ -79,6 +83,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 "["     TC; return LEFT_SQUARE_BRACKET;
 "]"	TC; return RIGHT_SQUARE_BRACKET;
 ":"	TC; return COLON;
+"."	TC; return PERIOD;
 ";"	TC; return SEMICOLON;
 "'"	TC; return QUOTE;
 " " TC;
diff --git a/src/cuda-sim/ptxinfo.y b/src/cuda-sim/ptxinfo.y
index b30395834..722520761 100644
--- a/src/cuda-sim/ptxinfo.y
+++ b/src/cuda-sim/ptxinfo.y
@@ -49,6 +49,8 @@ typedef void * yyscan_t;
 %token FUNC
 %token USED
 %token REGS
+%token BARRIERS
+%token COMPILETIME
 %token BYTES
 %token LMEM
 %token SMEM
@@ -70,6 +72,8 @@ typedef void * yyscan_t;
 %token <string_value> FUNCTION
 %token <string_value> VARIABLE
 %token FATAL
+%token PERIOD
+%token MS
 
 %{
 	#include <stdlib.h>
@@ -81,6 +85,7 @@ typedef void * yyscan_t;
 	void yyerror(yyscan_t yyscanner, ptxinfo_data* ptxinfo, const char* msg);
 	void ptxinfo_function(const char *fname );
 	void ptxinfo_regs( unsigned nregs );
+	void ptxinfo_barriers( unsigned barriers );
 	void ptxinfo_lmem( unsigned declared, unsigned system );
 	void ptxinfo_gmem( unsigned declared, unsigned system );
 	void ptxinfo_smem( unsigned declared, unsigned system );
@@ -126,8 +131,10 @@ info: 	  USED INT_OPERAND REGS { ptxinfo_regs($2); }
 	| INT_OPERAND BYTES LMEM { ptxinfo_lmem($1,0); }
 	| INT_OPERAND BYTES SMEM { ptxinfo_smem($1,0); }
 	| INT_OPERAND BYTES CMEM { ptxinfo_cmem($1,0); }
+	| USED INT_OPERAND BARRIERS { ptxinfo_barriers($2); }
 	| INT_OPERAND REGS { ptxinfo_regs($1); }
 	| INT_OPERAND TEXTURES {}
+	| COMPILETIME INT_OPERAND PERIOD INT_OPERAND MS {}
 	;
 
 tuple: INT_OPERAND PLUS INT_OPERAND BYTES { g_declared=$1; g_system=$3; }
diff --git a/src/gpgpu-sim/dram.cc b/src/gpgpu-sim/dram.cc
index 80e20d795..1e1ad3d02 100644
--- a/src/gpgpu-sim/dram.cc
+++ b/src/gpgpu-sim/dram.cc
@@ -684,13 +684,9 @@ bool dram_t::issue_row_command(int j) {
 }
 
 // if mrq is being serviced by dram, gets popped after CL latency fulfilled
-class mem_fetch *dram_t::return_queue_pop() {
-  return returnq->pop();
-}
+class mem_fetch *dram_t::return_queue_pop() { return returnq->pop(); }
 
-class mem_fetch *dram_t::return_queue_top() {
-  return returnq->top();
-}
+class mem_fetch *dram_t::return_queue_top() { return returnq->top(); }
 
 void dram_t::print(FILE *simFile) const {
   unsigned i;
diff --git a/src/gpgpu-sim/gpu-sim.cc b/src/gpgpu-sim/gpu-sim.cc
index 55d70d115..3f84d42fc 100644
--- a/src/gpgpu-sim/gpu-sim.cc
+++ b/src/gpgpu-sim/gpu-sim.cc
@@ -247,12 +247,12 @@ void memory_config::reg_options(class OptionParser *opp) {
 
   option_parser_register(opp, "-l2_ideal", OPT_BOOL, &l2_ideal,
                          "Use a ideal L2 cache that always hit", "0");
-  option_parser_register(opp, "-gpgpu_cache:dl2", OPT_CSTR,
-                         &m_L2_config.m_config_string,
-                         "unified banked L2 data cache config "
-                         " {<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_"
-                         "alloc>,<mshr>:<N>:<merge>,<mq>}",
-                         "64:128:8,L:B:m:N,A:16:4,4");
+  option_parser_register(
+      opp, "-gpgpu_cache:dl2", OPT_CSTR, &m_L2_config.m_config_string,
+      "unified banked L2 data cache config "
+      " {<sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_"
+      "index_fn>,<mshr>:<N>:<merge>,<mq>:<fifo_entry>,<data_port_width>",
+      "S:32:128:24,L:B:m:L:P,A:192:4,32:0,32");
   option_parser_register(opp, "-gpgpu_cache:dl2_texture_only", OPT_BOOL,
                          &m_L2_texure_only, "L2 cache used for texture only",
                          "1");
@@ -344,17 +344,18 @@ void shader_core_config::reg_options(class OptionParser *opp) {
       " {<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>,<mshr>:<N>:<"
       "merge>,<mq>} ",
       "64:64:2,L:R:f:N,A:2:32,4");
-  option_parser_register(opp, "-gpgpu_cache:il1", OPT_CSTR,
-                         &m_L1I_config.m_config_string,
-                         "shader L1 instruction cache config "
-                         " {<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_"
-                         "alloc>,<mshr>:<N>:<merge>,<mq>} ",
-                         "4:256:4,L:R:f:N,A:2:32,4");
+  option_parser_register(
+      opp, "-gpgpu_cache:il1", OPT_CSTR, &m_L1I_config.m_config_string,
+      "shader L1 instruction cache config "
+      " {<sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_"
+      "alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>} ",
+      "N:64:128:16,L:R:f:N:L,S:2:48,4");
   option_parser_register(opp, "-gpgpu_cache:dl1", OPT_CSTR,
                          &m_L1D_config.m_config_string,
                          "per-shader L1 data cache config "
-                         " {<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_"
-                         "alloc>,<mshr>:<N>:<merge>,<mq> | none}",
+                         " {<sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<"
+                         "alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<"
+                         "mq>:<fifo_entry>,<data_port_width> | none}",
                          "none");
   option_parser_register(opp, "-gpgpu_l1_cache_write_ratio", OPT_UINT32,
                          &m_L1D_config.m_wr_percent, "L1D write ratio", "0");
@@ -374,14 +375,16 @@ void shader_core_config::reg_options(class OptionParser *opp) {
   option_parser_register(opp, "-gpgpu_cache:dl1PrefL1", OPT_CSTR,
                          &m_L1D_config.m_config_stringPrefL1,
                          "per-shader L1 data cache config "
-                         " {<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_"
-                         "alloc>,<mshr>:<N>:<merge>,<mq> | none}",
+                         " {<sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<"
+                         "alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<"
+                         "mq>:<fifo_entry>,<data_port_width> | none}",
                          "none");
   option_parser_register(opp, "-gpgpu_cache:dl1PrefShared", OPT_CSTR,
                          &m_L1D_config.m_config_stringPrefShared,
                          "per-shader L1 data cache config "
-                         " {<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_"
-                         "alloc>,<mshr>:<N>:<merge>,<mq> | none}",
+                         " {<sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<"
+                         "alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<"
+                         "mq>:<fifo_entry>,<data_port_width> | none}",
                          "none");
   option_parser_register(opp, "-gpgpu_gmem_skip_L1D", OPT_BOOL, &gmem_skip_L1D,
                          "global memory access skip L1D cache (implements "
diff --git a/src/gpgpu-sim/gpu-sim.h b/src/gpgpu-sim/gpu-sim.h
index c1f50961f..68bdca72e 100644
--- a/src/gpgpu-sim/gpu-sim.h
+++ b/src/gpgpu-sim/gpu-sim.h
@@ -32,11 +32,11 @@
 #ifndef GPU_SIM_H
 #define GPU_SIM_H
 
+#include <stdint.h>
 #include <stdio.h>
 #include <fstream>
 #include <iostream>
 #include <list>
-#include <stdint.h>
 #include "../abstract_hardware_model.h"
 #include "../option_parser.h"
 #include "../trace.h"
@@ -879,7 +879,7 @@ class sst_gpgpu_sim : public gpgpu_sim {
    * @param dst_start_addr
    * @param count
    */
-  void perf_memcpy_to_gpu(size_t dst_start_addr, size_t count){};
+  void perf_memcpy_to_gpu(size_t dst_start_addr, size_t count) {};
 
   /**
    * @brief Check if the SST config matches up with the
diff --git a/src/gpgpu-sim/l2cache.h b/src/gpgpu-sim/l2cache.h
index 65c9c38b3..9d164d766 100644
--- a/src/gpgpu-sim/l2cache.h
+++ b/src/gpgpu-sim/l2cache.h
@@ -106,9 +106,7 @@ class memory_partition_unit {
 
   unsigned get_mpid() const { return m_id; }
 
-  class gpgpu_sim *get_mgpu() const {
-    return m_gpu;
-  }
+  class gpgpu_sim *get_mgpu() const { return m_gpu; }
 
  private:
   unsigned m_id;
diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h
index ee10af664..0840d3f76 100644
--- a/src/gpgpu-sim/shader.h
+++ b/src/gpgpu-sim/shader.h
@@ -46,7 +46,7 @@
 #include <utility>
 #include <vector>
 
-//#include "../cuda-sim/ptx.tab.h"
+// #include "../cuda-sim/ptx.tab.h"
 
 #include "../abstract_hardware_model.h"
 #include "delayqueue.h"
@@ -273,9 +273,7 @@ class shd_warp_t {
   unsigned get_dynamic_warp_id() const { return m_dynamic_warp_id; }
   unsigned get_warp_id() const { return m_warp_id; }
 
-  class shader_core_ctx *get_shader() {
-    return m_shader;
-  }
+  class shader_core_ctx *get_shader() { return m_shader; }
 
  private:
   static const unsigned IBUFFER_SIZE = 2;
diff --git a/src/gpgpu-sim/visualizer.cc b/src/gpgpu-sim/visualizer.cc
index a832d6119..022201984 100644
--- a/src/gpgpu-sim/visualizer.cc
+++ b/src/gpgpu-sim/visualizer.cc
@@ -34,7 +34,7 @@
 #include "mem_latency_stat.h"
 #include "power_stat.h"
 #include "shader.h"
-//#include "../../../mcpat/processor.h"
+// #include "../../../mcpat/processor.h"
 #include "gpu-cache.h"
 #include "stat-tool.h"
 
diff --git a/src/gpgpusim_entrypoint.cc b/src/gpgpusim_entrypoint.cc
index e2b711ede..be492295f 100644
--- a/src/gpgpusim_entrypoint.cc
+++ b/src/gpgpusim_entrypoint.cc
@@ -56,7 +56,9 @@ class stream_manager *g_stream_manager() {
 
 // SST callback
 extern void SST_callback_cudaThreadSynchronize_done();
+extern void SST_callback_cudaStreamSynchronize_done(cudaStream_t stream);
 __attribute__((weak)) void SST_callback_cudaThreadSynchronize_done() {}
+__attribute__((weak)) void SST_callback_cudaStreamSynchronize_done(cudaStream_t stream) {}
 
 void *gpgpu_sim_thread_sequential(void *ctx_ptr) {
   gpgpu_context *ctx = (gpgpu_context *)ctx_ptr;
@@ -100,8 +102,7 @@ void *gpgpu_sim_thread_concurrent(void *ctx_ptr) {
       fflush(stdout);
     }
     while (ctx->the_gpgpusim->g_stream_manager->empty_protected() &&
-           !ctx->the_gpgpusim->g_sim_done)
-      ;
+           !ctx->the_gpgpusim->g_sim_done);
     if (g_debug_execution >= 3) {
       printf("GPGPU-Sim: ** START simulation thread (detected work) **\n");
       ctx->the_gpgpusim->g_stream_manager->print(stdout);
@@ -190,12 +191,33 @@ bool SST_Cycle() {
   // Check if Synchronize is done when SST previously requested
   // cudaThreadSynchronize
   if (GPGPU_Context()->requested_synchronize &&
-      ((g_stream_manager()->empty() && !GPGPUsim_ctx_ptr()->g_sim_active) ||
+      ((g_stream_manager()->empty_protected() && !GPGPUsim_ctx_ptr()->g_sim_active) ||
        GPGPUsim_ctx_ptr()->g_sim_done)) {
     SST_callback_cudaThreadSynchronize_done();
     GPGPU_Context()->requested_synchronize = false;
   }
 
+  // Polling to check for each stream if it is marked for requested with sync
+  if (g_stream_manager()->get_stream_zero()->requested_synchronize() &&
+      ((g_stream_manager()->empty_protected() && !GPGPUsim_ctx_ptr()->g_sim_active) || 
+      GPGPUsim_ctx_ptr()->g_sim_done)) {
+    SST_callback_cudaStreamSynchronize_done(0);
+    g_stream_manager()->get_stream_zero()->reset_request_synchronize();
+  }
+
+  // Iterate through each stream to check if SST is waiting on
+  // it and it does not have any operation
+  std::list<CUstream_st *>& streams = g_stream_manager()->get_concurrent_streams();
+  for (auto it = streams.begin(); it != streams.end(); it++) {
+    CUstream_st *stream = *it;
+    if (stream->requested_synchronize() &&
+        stream->empty()) {
+      // This stream is ready
+      SST_callback_cudaStreamSynchronize_done(stream);
+      stream->reset_request_synchronize();
+    }
+  }
+
   if (g_stream_manager()->empty_protected() &&
       !GPGPUsim_ctx_ptr()->g_sim_done && !g_the_gpu()->active()) {
     GPGPUsim_ctx_ptr()->g_sim_active = false;
@@ -273,7 +295,6 @@ void gpgpu_context::synchronize() {
 
 bool gpgpu_context::synchronize_check() {
   // printf("GPGPU-Sim: synchronize checking for inactive GPU simulation\n");
-  requested_synchronize = true;
   the_gpgpusim->g_stream_manager->print(stdout);
   fflush(stdout);
   //    sem_wait(&g_sim_signal_finish);
diff --git a/src/stream_manager.cc b/src/stream_manager.cc
index 58c2ec4b5..d43964a8f 100644
--- a/src/stream_manager.cc
+++ b/src/stream_manager.cc
@@ -34,15 +34,20 @@
 
 unsigned CUstream_st::sm_next_stream_uid = 0;
 
-// SST memcpy callbacks
-extern void SST_callback_memcpy_H2D_done();
-extern void SST_callback_memcpy_D2H_done();
+// SST memcpy callbacks, called after a stream operation is done via record_next_done()
+extern void SST_callback_memcpy_H2D_done(uint64_t dst, uint64_t src, size_t count, cudaStream_t stream);
+extern void SST_callback_memcpy_D2H_done(uint64_t dst, uint64_t src, size_t count, cudaStream_t stream);
 extern void SST_callback_memcpy_to_symbol_done();
 extern void SST_callback_memcpy_from_symbol_done();
-__attribute__((weak)) void SST_callback_memcpy_H2D_done() {}
-__attribute__((weak)) void SST_callback_memcpy_D2H_done() {}
+extern void SST_callback_cudaEventSynchronize_done(cudaEvent_t event);
+extern void SST_callback_kernel_done(cudaStream_t stream);
+__attribute__((weak)) void SST_callback_memcpy_H2D_done(uint64_t dst, uint64_t src, size_t count, cudaStream_t stream) {}
+__attribute__((weak)) void SST_callback_memcpy_D2H_done(uint64_t dst, uint64_t src, size_t count, cudaStream_t stream) {}
 __attribute__((weak)) void SST_callback_memcpy_to_symbol_done() {}
 __attribute__((weak)) void SST_callback_memcpy_from_symbol_done() {}
+__attribute__((weak)) void SST_callback_cudaEventSynchronize_done(cudaEvent_t event);
+__attribute__((weak)) void SST_callback_kernel_done(cudaStream_t stream);
+
 
 CUstream_st::CUstream_st() {
   m_pending = false;
@@ -74,6 +79,10 @@ void CUstream_st::synchronize() {
   } while (!done);
 }
 
+bool CUstream_st::synchronize_check() {
+  return m_operations.empty();
+}
+
 void CUstream_st::push(const stream_operation &op) {
   // called by host thread
   pthread_mutex_lock(&m_lock);
@@ -132,13 +141,15 @@ bool stream_operation::do_operation(gpgpu_sim *gpu) {
       if (g_debug_execution >= 3) printf("memcpy host-to-device\n");
       gpu->memcpy_to_gpu(m_device_address_dst, m_host_address_src, m_cnt);
       m_stream->record_next_done();
-      if (gpu->is_SST_mode()) SST_callback_memcpy_H2D_done();
+      if (gpu->is_SST_mode()) {
+        SST_callback_memcpy_H2D_done((uint64_t) m_device_address_dst, (uint64_t) m_host_address_src, m_cnt, m_stream->is_stream_zero_stream() ? 0 : m_stream);
+      }
       break;
     case stream_memcpy_device_to_host:
       if (g_debug_execution >= 3) printf("memcpy device-to-host\n");
       gpu->memcpy_from_gpu(m_host_address_dst, m_device_address_src, m_cnt);
       m_stream->record_next_done();
-      if (gpu->is_SST_mode()) SST_callback_memcpy_D2H_done();
+      if (gpu->is_SST_mode()) SST_callback_memcpy_D2H_done((uint64_t) m_host_address_dst, (uint64_t) m_device_address_src, m_cnt, m_stream->is_stream_zero_stream() ? 0 : m_stream);
       break;
     case stream_memcpy_device_to_device:
       if (g_debug_execution >= 3) printf("memcpy device-to-device\n");
@@ -194,6 +205,13 @@ bool stream_operation::do_operation(gpgpu_sim *gpu) {
       time_t wallclock = time((time_t *)NULL);
       m_event->update(gpu->gpu_tot_sim_cycle, wallclock);
       m_stream->record_next_done();
+      if ((gpu->is_SST_mode()) && m_event->done() &&
+          m_event->requested_synchronize()) {
+        // Notify that the event is done
+        SST_callback_cudaEventSynchronize_done(m_event);
+        // Reset the sync flag as we have notified SST
+        m_event->reset_request_synchronize();
+      }
     } break;
     case stream_wait_event:
       // only allows next op to go if event is done
@@ -252,6 +270,9 @@ stream_manager::stream_manager(gpgpu_sim *gpu, bool cuda_launch_blocking) {
   m_cuda_launch_blocking = cuda_launch_blocking;
   pthread_mutex_init(&m_lock, NULL);
   m_last_stream = m_streams.begin();
+
+  // Mark stream zero as the default stream
+  m_stream_zero.set_stream_zero();
 }
 
 bool stream_manager::operation(bool *sim) {
@@ -303,6 +324,11 @@ bool stream_manager::register_finished_kernel(unsigned grid_uid) {
       //            grid_uid, stream->get_uid()); kernel_stat.flush();
       //            kernel_stat.close();
       stream->record_next_done();
+      // Callback to notify a kernel is done for SST's stream
+      // manager to support with nonblocking + blocking kernel launch
+      if (m_gpu->is_SST_mode()) {
+        SST_callback_kernel_done(stream->is_stream_zero_stream() ? 0 : stream);
+      }
       m_grid_id_to_stream.erase(grid_uid);
       kernel->notify_parent_finished();
       delete kernel;
@@ -393,8 +419,7 @@ void stream_manager::add_stream(struct CUstream_st *stream) {
 void stream_manager::destroy_stream(CUstream_st *stream) {
   // called by host thread
   pthread_mutex_lock(&m_lock);
-  while (!stream->empty())
-    ;
+  while (!stream->empty());
   std::list<CUstream_st *>::iterator s;
   for (s = m_streams.begin(); s != m_streams.end(); s++) {
     if (*s == stream) {
diff --git a/src/stream_manager.h b/src/stream_manager.h
index 561f54b87..55cfb8d28 100644
--- a/src/stream_manager.h
+++ b/src/stream_manager.h
@@ -69,6 +69,11 @@ struct CUevent_st {
   void issue() { m_issued++; }
   unsigned int num_issued() const { return m_issued; }
 
+  // SST related, stating this event is requested to synchronize
+  void set_request_synchronize() { m_requested_synchronize = true; }
+  void reset_request_synchronize() { m_requested_synchronize = false; }
+  bool requested_synchronize() const { return m_requested_synchronize; }
+
  private:
   int m_uid;
   bool m_blocking;
@@ -77,6 +82,9 @@ struct CUevent_st {
   unsigned int m_issued;
   time_t m_wallclock;
   double m_gpu_tot_sim_cycle;
+  
+  // SST related
+  bool m_requested_synchronize = false;
 
   static int m_next_event_uid;
 };
@@ -198,9 +206,7 @@ class stream_operation {
   kernel_info_t *get_kernel() { return m_kernel; }
   bool do_operation(gpgpu_sim *gpu);
   void print(FILE *fp) const;
-  struct CUstream_st *get_stream() {
-    return m_stream;
-  }
+  struct CUstream_st *get_stream() { return m_stream; }
   void set_stream(CUstream_st *stream) { m_stream = stream; }
 
  private:
@@ -228,6 +234,7 @@ struct CUstream_st {
   bool empty();
   bool busy();
   void synchronize();
+  bool synchronize_check();
   void push(const stream_operation &op);
   void record_next_done();
   stream_operation next();
@@ -235,6 +242,12 @@ struct CUstream_st {
   stream_operation &front() { return m_operations.front(); }
   void print(FILE *fp);
   unsigned get_uid() const { return m_uid; }
+  void set_request_synchronize() { m_requested_synchronize = true; }
+  void reset_request_synchronize() { m_requested_synchronize = false; }
+  bool requested_synchronize() const { return m_requested_synchronize; }
+  void set_stream_zero() { is_stream_zero = true; }
+  bool is_stream_zero_stream() { return is_stream_zero; }
+  void reset_stream_zero() { is_stream_zero = false; }
 
  private:
   unsigned m_uid;
@@ -245,6 +258,11 @@ struct CUstream_st {
 
   pthread_mutex_t m_lock;  // ensure only one host or gpu manipulates stream
                            // operation at one time
+
+  // SST related, use to record the stream is requested to synchronize
+  bool m_requested_synchronize = false;
+  // Whether this is the default stream
+  bool is_stream_zero = false;
 };
 
 class stream_manager {
@@ -265,6 +283,8 @@ class stream_manager {
   void stop_all_running_kernels();
   unsigned size() { return m_streams.size(); };
   bool is_blocking() { return m_cuda_launch_blocking; };
+  CUstream_st *get_stream_zero() { return &m_stream_zero; };
+  std::list<CUstream_st *>& get_concurrent_streams() { return m_streams; };
 
  private:
   void print_impl(FILE *fp);