# This config models the Volta Titan V
# For more info about volta architecture:
# http://images.nvidia.com/content/volta-architecture/pdf/volta-architecture-whitepaper.pdf
# https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8344474&tag=1#
# http://on-demand.gputechconf.com/gtc/2018/presentation/s8122-dissecting-the-volta-gpu-architecture-through-microbenchmarking.pdf
# https://en.wikipedia.org/wiki/Volta_(microarchitecture)
# https://www.hotchips.org/wp-content/uploads/hc_archives/hc29/HC29.21-Monday-Pub/HC29.21.10-GPU-Gaming-Pub/HC29.21.132-Volta-Choquette-NVIDIA-Final3.pdf
# https://devblogs.nvidia.com/inside-volta/
# http://on-demand.gputechconf.com/gtc/2017/presentation/s7798-luke-durant-inside-volta.pdf
# functional simulator specification
-gpgpu_ptx_instruction_classification 0
-gpgpu_ptx_sim_mode 0
-gpgpu_ptx_force_max_capability 70
# Device Limits
-gpgpu_stack_size_limit 1024
-gpgpu_heap_size_limit 8388608
-gpgpu_runtime_sync_depth_limit 2
-gpgpu_runtime_pending_launch_count_limit 2048
-gpgpu_max_concurrent_kernel 128
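# For reference, the sizes above work out to: heap limit 8388608 B = 8 MB, stack limit 1024 B per thread.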
# Compute Capability
-gpgpu_compute_capability_major 7
-gpgpu_compute_capability_minor 0
# SASS execution (only supported with CUDA >= 4.0)
-gpgpu_ptx_convert_to_ptxplus 0
-gpgpu_ptx_save_converted_ptxplus 0
# high level architecture configuration
-gpgpu_n_clusters 40
-gpgpu_n_cores_per_cluster 2
-gpgpu_n_mem 24
-gpgpu_n_sub_partition_per_mchannel 2
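# Sanity check on the totals above: 40 clusters x 2 cores/cluster = 80 SMs, matching
# the Titan V, and 24 memory channels x 2 = 48 memory sub-partitions.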
# volta clock domains
#-gpgpu_clock_domains <Core Clock>:<Interconnect Clock>:<L2 Clock>:<DRAM Clock>
# Volta NVIDIA TITANV clock domains are adopted from
# https://en.wikipedia.org/wiki/Volta_(microarchitecture)
-gpgpu_clock_domains 1200.0:1200.0:1200.0:850.0
# boost mode
# -gpgpu_clock_domains 1455.0:1455.0:1455.0:850.0
# shader core pipeline config
-gpgpu_shader_registers 65536
-gpgpu_registers_per_block 65536
-gpgpu_occupancy_sm_number 70
# This implies a maximum of 64 warps/SM
-gpgpu_shader_core_pipeline 2048:32
-gpgpu_shader_cta 32
-gpgpu_simd_model 1
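# Worked occupancy numbers implied by the settings above: 2048 threads / 32 threads
# per warp = 64 warps/SM, and 65536 registers / 2048 threads = at most 32 registers
# per thread at full occupancy.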
# Pipeline widths and number of FUs
# ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE
## Volta TITANV has 4 SP SIMD units, 4 INT units, 4 SFU units, 4 DP units, and 4 tensor core units per core
## the number of pipeline registers is scaled to match the number of SP units
-gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4
-gpgpu_num_sp_units 4
-gpgpu_num_sfu_units 4
-gpgpu_num_dp_units 4
-gpgpu_num_int_units 4
-gpgpu_tensor_core_avail 1
-gpgpu_num_tensor_core_units 4
# Instruction latencies and initiation intervals
# "ADD,MAX,MUL,MAD,DIV"
# All Div operations are executed on SFU unit
# Throughput (initiation latency) are adopted from
# http://on-demand.gputechconf.com/gtc/2018/presentation/s8122-dissecting-the-volta-gpu-architecture-through-microbenchmarking.pdf
-ptx_opcode_latency_int 4,13,4,5,145,32
-ptx_opcode_initiation_int 2,2,2,2,8,4
-ptx_opcode_latency_fp 4,13,4,5,39
-ptx_opcode_initiation_fp 2,2,2,2,4
-ptx_opcode_latency_dp 8,19,8,8,330
-ptx_opcode_initiation_dp 4,4,4,4,130
-ptx_opcode_latency_sfu 100
-ptx_opcode_initiation_sfu 8
-ptx_opcode_latency_tesnor 64 # 'tesnor' matches the option name registered by the simulator
-ptx_opcode_initiation_tensor 64
# Volta has a sub-core model, in which each scheduler has its own register file and execution units,
# i.e. schedulers are isolated
-gpgpu_sub_core_model 1
# disable specialized operand collectors and use generic operand collectors instead
-gpgpu_enable_specialized_operand_collector 0
-gpgpu_operand_collector_num_units_gen 8
-gpgpu_operand_collector_num_in_ports_gen 8
-gpgpu_operand_collector_num_out_ports_gen 8
# Volta has 8 register banks and 4 schedulers, i.e. two banks per scheduler
-gpgpu_num_reg_banks 8
-gpgpu_reg_file_port_throughput 2
# shared memory bank-conflict detection
-gpgpu_shmem_num_banks 32
-gpgpu_shmem_limited_broadcast 0
-gpgpu_shmem_warp_parts 1
-gpgpu_coalesce_arch 60
# Volta has four schedulers per core
-gpgpu_num_sched_per_core 4
# Loose round-robin (lrr) warp scheduler
-gpgpu_scheduler lrr
## In Volta, a warp scheduler can issue 1 inst per cycle
-gpgpu_max_insn_issue_per_warp 1
-gpgpu_dual_issue_diff_exec_units 1
## L1/shared memory configuration
# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>,<data_port_width>
# ** Optional parameter - Required when mshr_type==Texture Fifo, set to 0 if not used
# Default config is a 32KB DL1 and 96KB of shared memory
# In Volta, the remaining shared memory capacity is assigned to the L1 cache;
# if the assigned shared memory is 0, the L1 cache is 128KB
# For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x
# disable this mode when running multiple kernels/apps concurrently
-gpgpu_adaptive_cache_config 1
-gpgpu_shmem_option 0,8,16,32,64,96
-gpgpu_unified_l1d_size 128
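# Worked example of the adaptive split: L1 size = 128KB unified - shared memory
# carve-out, so a 96KB carve-out leaves the default 32KB L1, and a 0KB carve-out
# leaves the full 128KB for L1.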
# L1 cache configuration
-gpgpu_l1_banks 4
-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32
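# Decoding the dl1 string with the format above: S:4:128:64 = sectored, 4 sets x
# 128-byte lines x 64-way = 32KB, i.e. the default DL1 size.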
-gpgpu_l1_cache_write_ratio 25
-gpgpu_gmem_skip_L1D 0
-gpgpu_l1_latency 20
-gpgpu_flush_l1_cache 1
-gpgpu_n_cluster_ejection_buffer_size 32
# shared memory configuration
-gpgpu_shmem_size 98304
-gpgpu_shmem_sizeDefault 98304
-gpgpu_shmem_per_block 65536
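# 98304 B = 96KB of shared memory per SM, of which one block may use at most 65536 B = 64KB.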
-gpgpu_smem_latency 20
# 32 sets x 128-byte lines x 24-way per memory sub-partition = 96 KB per sub-partition;
# with 24 channels x 2 sub-partitions = 48 sub-partitions, this gives a 4.5MB L2 cache
-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32
-gpgpu_cache:dl2_texture_only 0
-gpgpu_dram_partition_queues 64:64:64:64
-gpgpu_perf_sim_memcpy 1
-gpgpu_memory_partition_indexing 2
# 128 KB Inst.
-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4
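# Geometry check against the size above: 64 sets x 128-byte lines x 16-way = 128KB.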
-gpgpu_inst_fetch_throughput 4
# 48 KB Tex
# Note: TEX is deprecated in Volta; it is used for legacy apps only. Use the L1D cache instead with the .nc modifier or the __ldg method
-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2
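# (For reference, the configured geometry here, 4 sets x 128-byte lines x 256-way, totals 128KB.)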
# 64 KB Const
-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4
-gpgpu_perfect_inst_const_cache 1
# interconnection
#-network_mode 1
#-inter_config_file config_volta_islip.icnt
# use built-in local xbar
-network_mode 2
-icnt_in_buffer_limit 512
-icnt_out_buffer_limit 512
-icnt_subnets 2
-icnt_flit_size 40
-icnt_arbiter_algo 1
# memory partition latency config
-gpgpu_l2_rop_latency 160
-dram_latency 100
# dram model config
-gpgpu_dram_scheduler 1
-gpgpu_frfcfs_dram_sched_queue_size 64
-gpgpu_dram_return_queue_size 192
# For HBM: three stacks, 24 channels, each 128 bits (16 bytes) wide
-gpgpu_n_mem_per_ctrlr 1
-gpgpu_dram_buswidth 16
-gpgpu_dram_burst_length 2
-dram_data_command_freq_ratio 2 # HBM is DDR
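# Peak DRAM bandwidth implied by the numbers above: 24 channels x 16 B x 2 (DDR)
# x 850 MHz = 652.8 GB/s, the Titan V's quoted peak.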
-gpgpu_mem_address_mask 1
-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCB.CCCSSSSS
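# Reading the mask above (R = row, B = bank, C = column, S = byte-offset bits, per
# the simulator's mapping convention): 13 row bits, 4 bank bits (matching nbk=16),
# 6 column bits, and 5 offset bits; dramid@8 places the channel-select bits at bit 8.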
# HBM timings are adopted from the JEDEC JESD235 HBM standard and the NVIDIA HPCA 2017 paper (http://www.cs.utah.edu/~nil/pubs/hpca17.pdf)
# Timings for 1 GHz
# tRRDl and tWTR are missing and need to be added
#-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=4:RCD=14:RAS=33:RP=14:RC=47:
# CL=14:WL=2:CDLR=3:WR=12:nbkgrp=4:CCDL=2:RTPL=4"
# Timings for 850 MHz; the TITANV's HBM runs at 850 MHz
-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=3:RCD=12:RAS=28:RP=12:RC=40:
CL=12:WL=2:CDLR=3:WR=10:nbkgrp=4:CCDL=2:RTPL=3"
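# Scaling check between the two frequency points: RCD is 14 cycles at 1 GHz (14 ns)
# and 12 cycles at 850 MHz (~14.1 ns), so the absolute timings are preserved.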
# HBM has a dual-bus interface that can issue a column command and a row command at the same time
-dram_dual_bus_interface 1
# select the lower bits for the bank group to increase bank-group parallelism
-dram_bnk_indexing_policy 1
-dram_bnkgrp_indexing_policy 1
-dram_seperate_write_queue_enable 1
-dram_write_queue_size 128:108:32
# stat collection
-gpgpu_memlatency_stat 14
-gpgpu_runtime_stat 500
-enable_ptx_file_line_stats 1
-visualizer_enabled 0
# tracing functionality
#-trace_enabled 1
#-trace_components WARP_SCHEDULER,SCOREBOARD
#-trace_sampling_core 0