Commit 48936a7

Author: Yechen Liu
Add SM6_GTX1080Ti/gpgpusim.config back
1 parent 8df4e81

1 file changed: SM6_GTX1080Ti/gpgpusim.config (+289 −0)
@@ -0,0 +1,289 @@
# This config models the Volta architecture
# For more info about the Volta architecture:
# http://images.nvidia.com/content/volta-architecture/pdf/volta-architecture-whitepaper.pdf
# https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8344474&tag=1#
# http://on-demand.gputechconf.com/gtc/2018/presentation/s8122-dissecting-the-volta-gpu-architecture-through-microbenchmarking.pdf
# https://en.wikipedia.org/wiki/Volta_(microarchitecture)
# https://www.hotchips.org/wp-content/uploads/hc_archives/hc29/HC29.21-Monday-Pub/HC29.21.10-GPU-Gaming-Pub/HC29.21.132-Volta-Choquette-NVIDIA-Final3.pdf
# https://devblogs.nvidia.com/inside-volta/
# http://on-demand.gputechconf.com/gtc/2017/presentation/s7798-luke-durant-inside-volta.pdf

# functional simulator specification
-gpgpu_ptx_instruction_classification 0
-gpgpu_ptx_sim_mode 0
-gpgpu_ptx_force_max_capability 70

# Device Limits
-gpgpu_stack_size_limit 1024
-gpgpu_heap_size_limit 8388608
-gpgpu_runtime_sync_depth_limit 2
-gpgpu_runtime_pending_launch_count_limit 2048
-gpgpu_kernel_launch_latency 5000
-gpgpu_TB_launch_latency 0

# Compute Capability
-gpgpu_compute_capability_major 7
-gpgpu_compute_capability_minor 0

# PTX execution-driven
-gpgpu_ptx_convert_to_ptxplus 0
-gpgpu_ptx_save_converted_ptxplus 0

# high level architecture configuration
-gpgpu_n_clusters 80
-gpgpu_n_cores_per_cluster 1
-gpgpu_n_mem 32
-gpgpu_n_sub_partition_per_mchannel 2

# Volta clock domains
#-gpgpu_clock_domains <Core Clock>:<Interconnect Clock>:<L2 Clock>:<DRAM Clock>
-gpgpu_clock_domains 1132.0:1132.0:1132.0:850.0
# boost mode
# -gpgpu_clock_domains 1628.0:1628.0:1628.0:850.0

# shader core pipeline config
-gpgpu_shader_registers 65536
-gpgpu_registers_per_block 65536
-gpgpu_occupancy_sm_number 70

# This implies a maximum of 64 warps/SM
-gpgpu_shader_core_pipeline 2048:32
-gpgpu_shader_cta 32
-gpgpu_simd_model 1
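# Sanity check on the lines above (our arithmetic, not simulator output):
#   2048 threads/SM / 32 threads per warp = 64 warps/SM, matching the comment above;
#   65536 registers / 2048 threads = 32 registers per thread at full occupancy.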

# Pipeline widths and number of FUs
# ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE
## Volta GV100 has 4 SP SIMD units, 4 SFU units, 4 DP units, and 4 tensor core units per core
## we need to scale the number of pipeline registers to be equal to the number of SP units
-gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4
-gpgpu_num_sp_units 4
-gpgpu_num_sfu_units 4
-gpgpu_num_dp_units 4
-gpgpu_num_int_units 4
-gpgpu_tensor_core_avail 1
-gpgpu_num_tensor_core_units 4
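# A hedged decode of -gpgpu_pipeline_widths against the port order listed above:
#   ID_OC_SP=4, ID_OC_DP=4, ID_OC_INT=4, ID_OC_SFU=4, ID_OC_MEM=4,
#   OC_EX_SP=4, OC_EX_DP=4, OC_EX_INT=4, OC_EX_SFU=4, OC_EX_MEM=4,
#   EX_WB=8, ID_OC_TENSOR_CORE=4, OC_EX_TENSOR_CORE=4
# i.e. each issue/execute port matches the 4 units of its type; our reading is that
# EX_WB is doubled so writeback can drain more than one unit type per cycle.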

# Instruction latencies and initiation intervals
# "ADD,MAX,MUL,MAD,DIV"
# All div operations are executed on the SFU unit
-ptx_opcode_latency_int 4,13,4,5,145,21
-ptx_opcode_initiation_int 2,2,2,2,8,4
-ptx_opcode_latency_fp 4,13,4,5,39
-ptx_opcode_initiation_fp 2,2,2,2,4
-ptx_opcode_latency_dp 8,19,8,8,330
-ptx_opcode_initiation_dp 4,4,4,4,130
-ptx_opcode_latency_sfu 100
-ptx_opcode_initiation_sfu 8
# ("tesnor" spelling kept as-is; it matches the option name the simulator registers)
-ptx_opcode_latency_tesnor 64
-ptx_opcode_initiation_tensor 64
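# Reading these vectors (a worked example, not extra configuration): following the
# opcode order above, int ADD has latency 4 with initiation interval 2, so a dependent
# instruction waits 4 cycles while independent int ADDs can start every 2 cycles;
# int DIV has latency 145 with initiation interval 8 and, per the note above, runs on the SFU unit.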

# Volta has a sub-core model, in which each scheduler has its own register file and EUs
# i.e. schedulers are isolated
-gpgpu_sub_core_model 1
# disable specialized operand collectors and use generic operand collectors instead
-gpgpu_enable_specialized_operand_collector 0
-gpgpu_operand_collector_num_units_gen 8
-gpgpu_operand_collector_num_in_ports_gen 8
-gpgpu_operand_collector_num_out_ports_gen 8
# Volta has 8 banks and 4 schedulers, i.e. two banks per scheduler
# we increase #banks to 16 to mitigate the effect of the Register File Cache (RFC), which we do not implement in the current version
-gpgpu_num_reg_banks 16
-gpgpu_reg_file_port_throughput 2
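# A hedged reading of the two knobs above under the sub-core model: 16 banks split
# across 4 isolated schedulers leaves 4 banks per scheduler (vs 2 in real Volta),
# and a port throughput of 2 lets each register file port serve 2 requests per cycle.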

# shared memory bank conflict detection
-gpgpu_shmem_num_banks 32
-gpgpu_shmem_limited_broadcast 0
-gpgpu_shmem_warp_parts 1
-gpgpu_coalesce_arch 60

# Volta has four schedulers per core
-gpgpu_num_sched_per_core 4
# greedy-then-oldest (GTO) scheduler
-gpgpu_scheduler gto
## In Volta, a warp scheduler can issue 1 inst per cycle
-gpgpu_max_insn_issue_per_warp 1
-gpgpu_dual_issue_diff_exec_units 1
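# Issue arithmetic (hedged): 4 schedulers x 1 instruction per warp per cycle gives a
# peak of 4 warp-instructions issued per SM per cycle; our reading of
# -gpgpu_dual_issue_diff_exec_units 1 is that when a scheduler issues two
# instructions in the same cycle, they must target different execution units.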

## L1/shared memory configuration
# <nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>
# ** Optional parameter - required when mshr_type==Texture Fifo
# Default config is 32KB DL1 and 96KB shared memory
# In Volta, we assign the remaining shared memory to the L1 cache
# if the assigned shared memory = 0, then the L1 cache = 128KB
# For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x
# disable this mode when running multiple kernels/apps
-gpgpu_adaptive_cache_config 1
# Volta unified cache has four banks
-gpgpu_l1_banks 4
-gpgpu_cache:dl1 S:1:128:256,L:L:s:N:L,A:256:8,16:0,32
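# Decoding the DL1 line against the <nsets>:<bsize>:<assoc> template above (hedged):
#   S:1:128:256 = sectored cache, 1 set x 128B lines x 256 ways = 32KB, the default
#   DL1 size quoted above; with -gpgpu_adaptive_cache_config 1, shared memory left
#   unused out of the 128KB carveout is assigned to L1 (0KB shared -> 128KB L1).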
-gpgpu_shmem_size 98304
-gpgpu_shmem_sizeDefault 98304
-gpgpu_shmem_per_block 65536
-gpgpu_gmem_skip_L1D 0
-gpgpu_n_cluster_ejection_buffer_size 32
-gpgpu_l1_latency 20
-gpgpu_smem_latency 20
-gpgpu_flush_l1_cache 1

# 32 sets of 128-byte lines, 24-way, per memory sub-partition (96 KB per sub-partition). This gives us a 6MB L2 cache
-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32
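# The multiplication behind the 6MB claim above (our arithmetic):
#   32 sets x 128B x 24 ways = 96KB per sub-partition;
#   -gpgpu_n_mem 32 x -gpgpu_n_sub_partition_per_mchannel 2 = 64 sub-partitions;
#   64 x 96KB = 6MB of aggregate L2.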
-gpgpu_cache:dl2_texture_only 0
-gpgpu_dram_partition_queues 64:64:64:64
-gpgpu_perf_sim_memcpy 1
-gpgpu_memory_partition_indexing 2

# 128 KB Inst.
-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4
-gpgpu_inst_fetch_throughput 4
# 128 KB Tex
# Note: TEX is deprecated in Volta and is used for legacy apps only. Use the L1D cache instead, via the .nc modifier or the __ldg() method
-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2
# 64 KB Const
-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4
-gpgpu_perfect_inst_const_cache 1

# interconnection
#-network_mode 1
#-inter_config_file config_volta_islip.icnt
# use built-in local xbar
-network_mode 2
-icnt_in_buffer_limit 512
-icnt_out_buffer_limit 512
-icnt_subnets 2
-icnt_flit_size 40
-icnt_arbiter_algo 1

# memory partition latency config
-gpgpu_l2_rop_latency 160
-dram_latency 100

# dram model config
-gpgpu_dram_scheduler 1
-gpgpu_frfcfs_dram_sched_queue_size 64
-gpgpu_dram_return_queue_size 192

# HBM: four stacks, 32 channels (matching -gpgpu_n_mem above), each 128 bits (16 bytes) wide
-gpgpu_n_mem_per_ctrlr 1
-gpgpu_dram_buswidth 16
-gpgpu_dram_burst_length 2
-dram_data_command_freq_ratio 2 # HBM is DDR
-gpgpu_mem_address_mask 1
-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCB.CCCSSSSS
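# Back-of-the-envelope peak DRAM bandwidth from these knobs (our arithmetic):
#   32 channels x 16B bus x 2 transfers per clock (DDR) x 850MHz ~= 870GB/s,
#   close to the ~900GB/s HBM2 figure quoted for V100.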

# HBM timings are adopted from the Hynix JESD235 standard and the NVIDIA HPCA 2017 paper (http://www.cs.utah.edu/~nil/pubs/hpca17.pdf)
# Timing for 1 GHz
# tRRDl and tWTR are missing and need to be added
#-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=4:RCD=14:RAS=33:RP=14:RC=47:
# CL=14:WL=2:CDLR=3:WR=12:nbkgrp=4:CCDL=2:RTPL=4"

# Timing for 850 MHz; V100 HBM runs at 850 MHz
-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=3:RCD=12:RAS=28:RP=12:RC=40:
CL=12:WL=2:CDLR=3:WR=10:nbkgrp=4:CCDL=2:RTPL=3"
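# Hedged cross-check: these values are the 1GHz numbers above scaled by 0.85 and
# rounded, preserving the absolute latencies in ns, e.g. RAS: 33 cycles @ 1GHz
# = 33ns ~= 28 cycles @ 850MHz; RC: 47 cycles @ 1GHz ~= 40 cycles @ 850MHz.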

# HBM has a dual bus interface, so it can issue a column command and a row command at the same time
-dram_dual_bus_interface 1
# select lower bits for bnkgrp to increase bnkgrp parallelism
-dram_bnk_indexing_policy 0
-dram_bnkgrp_indexing_policy 1

#-dram_seperate_write_queue_enable 1
#-dram_write_queue_size 64:56:32

# stat collection
-gpgpu_memlatency_stat 14
-gpgpu_runtime_stat 500
-enable_ptx_file_line_stats 1
-visualizer_enabled 0

# power model configs; disabled until we create a real energy model for Volta
-power_simulation_enabled 0

# tracing functionality
#-trace_enabled 1
#-trace_components WARP_SCHEDULER,SCOREBOARD,LIVENESS
#-trace_sampling_core 0

### items for functional and timing simulation of UVM ###

# GDDR size should be less than or equal to 1GB, specified in units of MB or GB
-gddr_size 1GB

# size of a GDDR page; only 4KB and 2MB are available
-page_size 4KB

# number of TLB entries per SM
-tlb_size 4096

# average page table walk latency (in core cycles)
# for 4KB pages set to 100; for 2MB pages set to 66
-page_table_walk_latency 100
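# Our (hedged) rationale for the two suggested values: with 2MB pages the walk skips
# the last level of a 4-level radix page table, so the average walk is shorter
# (66 vs 100 core cycles).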

# page eviction policy
# 0 - LRU 2MB (default)
# 1 - LRU tree-based neighborhood
# 2 - LRU sequential locality 64K
# 3 - random 4KB
# 4 - LFU 2MB
# 5 - LRU 4KB
-eviction_policy 0

# invalidate clean pages directly instead of writing them back
-invalidate_clean 0

# reserve a percentage (e.g. 10 or 20) of accessed pages from eviction, in the hope that they will be accessed in the next iteration
-reserve_accessed_page_percent 0

# percentage of the free page buffer at which page eviction is triggered (e.g. 5 or 10)
-percentage_of_free_page_buffer 0

# PCIe bandwidth per direction
-pcie_bandwidth 16.0GB/s
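# Worked example (our arithmetic): migrating one 4KB page over this 16GB/s link takes
# 4096B / 16GB/s = 0.256us, i.e. roughly 290 cycles at the 1132MHz core clock,
# before any page fault handling latency is added.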

# enable/disable GMMU statistics profiling for UVM
-sim_prof_enable 1

# disable deadlock check for UVM
-gpgpu_deadlock_detect 0

# latency to process a kernel launch (150us, or 222150 core cycles)
#-gpgpu_kernel_launch_latency 222150

# hardware prefetcher
# 0 - disabled
# 1 - tree-based neighborhood (default)
# 2 - sequential locality 64K
# 3 - random 4K
-hardware_prefetch 1

# hardware prefetcher under over-subscription
# 0 - disable upon eviction (default)
# 1 - tree-based neighborhood
# 2 - sequential locality 64K
# 3 - random 4K
-hwprefetch_oversub 1

# latency in core cycles to handle a page fault (45us)
# encompasses the overhead of stalling threads, resolving memory addresses, walking the page table, maintaining page flags, and transferring chunks in order
-page_fault_latency 66645
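# Consistency note (our arithmetic): 66645 cycles equals 45us only at a ~1.48GHz
# clock; at this file's 1132MHz core clock it corresponds to ~59us.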

# enable accurate simulation that stalls warps and serializes accesses during page fault handling (default 0)
-enable_accurate_simulation 0

# Enable direct CPU-memory access from the GPU
# 0 - disable
# 1 - adaptive
# 2 - always
# 3 - after oversubscription
-enable_dma 0

# access counter threshold for migrating a page from CPU to GPU
-migrate_threshold 8

# oversubscription multiplicative penalty factor for adaptive DMA
-multiply_dma_penalty 2

# enable access pattern detection, policy engine, and adaptive memory management
-enable_smart_runtime 0

# enable skipping cycles when all warps are stalled waiting for page faults to return
-skip_cycles_enable 1
