# This config models the Volta Titan V
# For more info about volta architecture:
# http://images.nvidia.com/content/volta-architecture/pdf/volta-architecture-whitepaper.pdf
# https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8344474&tag=1#
# http://on-demand.gputechconf.com/gtc/2018/presentation/s8122-dissecting-the-volta-gpu-architecture-through-microbenchmarking.pdf
# https://en.wikipedia.org/wiki/Volta_(microarchitecture)
# https://www.hotchips.org/wp-content/uploads/hc_archives/hc29/HC29.21-Monday-Pub/HC29.21.10-GPU-Gaming-Pub/HC29.21.132-Volta-Choquette-NVIDIA-Final3.pdf
# https://devblogs.nvidia.com/inside-volta/
# http://on-demand.gputechconf.com/gtc/2017/presentation/s7798-luke-durant-inside-volta.pdf
# functional simulator specification
-gpgpu_ptx_instruction_classification 0
-gpgpu_ptx_sim_mode 0
-gpgpu_ptx_force_max_capability 70
# Device Limits
-gpgpu_stack_size_limit 1024
-gpgpu_heap_size_limit 8388608
-gpgpu_runtime_sync_depth_limit 2
-gpgpu_runtime_pending_launch_count_limit 2048
-gpgpu_max_concurrent_kernel 128
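# For reference, the sizes above work out to: heap limit 8388608 B = 8 MB, stack limit 1024 B per thread.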
# Compute Capability
-gpgpu_compute_capability_major 7
-gpgpu_compute_capability_minor 0
# SASS execution (only supported with CUDA >= 4.0)
-gpgpu_ptx_convert_to_ptxplus 0
-gpgpu_ptx_save_converted_ptxplus 0
# high level architecture configuration
-gpgpu_n_clusters 40
-gpgpu_n_cores_per_cluster 2
-gpgpu_n_mem 24
-gpgpu_n_sub_partition_per_mchannel 2
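# Sanity check on the totals above: 40 clusters x 2 cores/cluster = 80 SMs, matching
# the Titan V, and 24 memory channels x 2 = 48 memory sub-partitions.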
# volta clock domains
#-gpgpu_clock_domains <Core Clock>:<Interconnect Clock>:<L2 Clock>:<DRAM Clock>
# Volta NVIDIA TITANV clock domains are adopted from
# https://en.wikipedia.org/wiki/Volta_(microarchitecture)
-gpgpu_clock_domains 1200.0:1200.0:1200.0:850.0
# boost mode
# -gpgpu_clock_domains 1455.0:1455.0:1455.0:850.0
# shader core pipeline config
-gpgpu_shader_registers 65536
-gpgpu_registers_per_block 65536
-gpgpu_occupancy_sm_number 70
# This implies a maximum of 64 warps/SM
-gpgpu_shader_core_pipeline 2048:32
-gpgpu_shader_cta 32
-gpgpu_simd_model 1
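# Worked occupancy numbers implied by the settings above: 2048 threads / 32 threads
# per warp = 64 warps/SM, and 65536 registers / 2048 threads = at most 32 registers
# per thread at full occupancy.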
# Pipeline widths and number of FUs
# ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE
## Volta TITANV has 4 SP SIMD units, 4 INT units, 4 SFU units, 4 DP units, and 4 tensor core units per core
## the number of pipeline registers is scaled to match the number of SP units
-gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4
-gpgpu_num_sp_units 4
-gpgpu_num_sfu_units 4
-gpgpu_num_dp_units 4
-gpgpu_num_int_units 4
-gpgpu_tensor_core_avail 1
-gpgpu_num_tensor_core_units 4
# Instruction latencies and initiation intervals
# "ADD,MAX,MUL,MAD,DIV"
# All Div operations are executed on SFU unit
# Throughput (initiation latency) are adopted from
# http://on-demand.gputechconf.com/gtc/2018/presentation/s8122-dissecting-the-volta-gpu-architecture-through-microbenchmarking.pdf
-ptx_opcode_latency_int 4,13,4,5,145,32
-ptx_opcode_initiation_int 2,2,2,2,8,4
-ptx_opcode_latency_fp 4,13,4,5,39
-ptx_opcode_initiation_fp 2,2,2,2,4
-ptx_opcode_latency_dp 8,19,8,8,330
-ptx_opcode_initiation_dp 4,4,4,4,130
-ptx_opcode_latency_sfu 100
-ptx_opcode_initiation_sfu 8
-ptx_opcode_latency_tesnor 64 # 'tesnor' matches the option name registered by the simulator
-ptx_opcode_initiation_tensor 64
# Volta has a sub-core model, in which each scheduler has its own register file and execution units,
# i.e. schedulers are isolated
-gpgpu_sub_core_model 1
# disable specialized operand collectors and use generic operand collectors instead
-gpgpu_enable_specialized_operand_collector 0
-gpgpu_operand_collector_num_units_gen 8
-gpgpu_operand_collector_num_in_ports_gen 8
-gpgpu_operand_collector_num_out_ports_gen 8
# Volta has 8 register banks and 4 schedulers, i.e. two banks per scheduler
-gpgpu_num_reg_banks 8
-gpgpu_reg_file_port_throughput 2
# shared memory bank-conflict detection
-gpgpu_shmem_num_banks 32
-gpgpu_shmem_limited_broadcast 0
-gpgpu_shmem_warp_parts 1
-gpgpu_coalesce_arch 60
# Volta has four schedulers per core
-gpgpu_num_sched_per_core 4
# Loose round-robin (lrr) warp scheduler
-gpgpu_scheduler lrr
## In Volta, a warp scheduler can issue 1 inst per cycle
-gpgpu_max_insn_issue_per_warp 1
-gpgpu_dual_issue_diff_exec_units 1
## L1/shared memory configuration
# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>,<data_port_width>
# ** Optional parameter - Required when mshr_type==Texture Fifo, set to 0 if not used
# Default config is a 32KB DL1 and 96KB of shared memory
# In Volta, the remaining shared memory capacity is assigned to the L1 cache;
# if the assigned shared memory is 0, the L1 cache is 128KB
# For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x
# disable this mode when running multiple kernels/apps concurrently
-gpgpu_adaptive_cache_config 1
-gpgpu_shmem_option 0,8,16,32,64,96
-gpgpu_unified_l1d_size 128
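# Worked example of the adaptive split: L1 size = 128KB unified - shared memory
# carve-out, so a 96KB carve-out leaves the default 32KB L1, and a 0KB carve-out
# leaves the full 128KB for L1.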
# L1 cache configuration
-gpgpu_l1_banks 4
-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32
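# Decoding the dl1 string with the format above: S:4:128:64 = sectored, 4 sets x
# 128-byte lines x 64-way = 32KB, i.e. the default DL1 size.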
-gpgpu_l1_cache_write_ratio 25
-gpgpu_gmem_skip_L1D 0
-gpgpu_l1_latency 20
-gpgpu_flush_l1_cache 1
-gpgpu_n_cluster_ejection_buffer_size 32
# shared memory configuration
-gpgpu_shmem_size 98304
-gpgpu_shmem_sizeDefault 98304
-gpgpu_shmem_per_block 65536
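# 98304 B = 96KB of shared memory per SM, of which one block may use at most 65536 B = 64KB.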
-gpgpu_smem_latency 20
# 32 sets x 128-byte lines x 24-way per memory sub-partition = 96 KB per sub-partition;
# with 24 channels x 2 sub-partitions = 48 sub-partitions, this gives a 4.5MB L2 cache
-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32
-gpgpu_cache:dl2_texture_only 0
-gpgpu_dram_partition_queues 64:64:64:64
-gpgpu_perf_sim_memcpy 1
-gpgpu_memory_partition_indexing 2
# 128 KB Inst.
-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4
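# Geometry check against the size above: 64 sets x 128-byte lines x 16-way = 128KB.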
-gpgpu_inst_fetch_throughput 4
# 48 KB Tex
# Note: TEX is deprecated in Volta; it is used for legacy apps only. Use the L1D cache instead with the .nc modifier or the __ldg method
-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2
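# (For reference, the configured geometry here, 4 sets x 128-byte lines x 256-way, totals 128KB.)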
# 64 KB Const
-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4
-gpgpu_perfect_inst_const_cache 1
# interconnection
#-network_mode 1
#-inter_config_file config_volta_islip.icnt
# use built-in local xbar
-network_mode 2
-icnt_in_buffer_limit 512
-icnt_out_buffer_limit 512
-icnt_subnets 2
-icnt_flit_size 40
-icnt_arbiter_algo 1
# memory partition latency config
-gpgpu_l2_rop_latency 160
-dram_latency 100
# dram model config
-gpgpu_dram_scheduler 1
-gpgpu_frfcfs_dram_sched_queue_size 64
-gpgpu_dram_return_queue_size 192
# For HBM: three stacks, 24 channels, each 128 bits (16 bytes) wide
-gpgpu_n_mem_per_ctrlr 1
-gpgpu_dram_buswidth 16
-gpgpu_dram_burst_length 2
-dram_data_command_freq_ratio 2 # HBM is DDR
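# Peak DRAM bandwidth implied by the numbers above: 24 channels x 16 B x 2 (DDR)
# x 850 MHz = 652.8 GB/s, the Titan V's quoted peak.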
-gpgpu_mem_address_mask 1
-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCB.CCCSSSSS
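# Reading the mask above (R = row, B = bank, C = column, S = byte-offset bits, per
# the simulator's mapping convention): 13 row bits, 4 bank bits (matching nbk=16),
# 6 column bits, and 5 offset bits; dramid@8 places the channel-select bits at bit 8.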
# HBM timings are adopted from the JEDEC JESD235 HBM standard and the NVIDIA HPCA 2017 paper (http://www.cs.utah.edu/~nil/pubs/hpca17.pdf)
# Timings for 1 GHz
# tRRDl and tWTR are missing and need to be added
#-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=4:RCD=14:RAS=33:RP=14:RC=47:
# CL=14:WL=2:CDLR=3:WR=12:nbkgrp=4:CCDL=2:RTPL=4"
# Timings for 850 MHz; the TITANV's HBM runs at 850 MHz
-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=3:RCD=12:RAS=28:RP=12:RC=40:
CL=12:WL=2:CDLR=3:WR=10:nbkgrp=4:CCDL=2:RTPL=3"
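# Scaling check between the two frequency points: RCD is 14 cycles at 1 GHz (14 ns)
# and 12 cycles at 850 MHz (~14.1 ns), so the absolute timings are preserved.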
# HBM has a dual-bus interface that can issue a column command and a row command at the same time
-dram_dual_bus_interface 1
# select the lower bits for the bank group to increase bank-group parallelism
-dram_bnk_indexing_policy 1
-dram_bnkgrp_indexing_policy 1
-dram_seperate_write_queue_enable 1
-dram_write_queue_size 128:108:32
# stat collection
-gpgpu_memlatency_stat 14
-gpgpu_runtime_stat 500
-enable_ptx_file_line_stats 1
-visualizer_enabled 0
# tracing functionality
#-trace_enabled 1
#-trace_components WARP_SCHEDULER,SCOREBOARD
#-trace_sampling_core 0