# This config models the Volta (GV100)
# For more info about the Volta architecture, see:
# http://images.nvidia.com/content/volta-architecture/pdf/volta-architecture-whitepaper.pdf
# https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8344474&tag=1#
# http://on-demand.gputechconf.com/gtc/2018/presentation/s8122-dissecting-the-volta-gpu-architecture-through-microbenchmarking.pdf
# https://en.wikipedia.org/wiki/Volta_(microarchitecture)
# https://www.hotchips.org/wp-content/uploads/hc_archives/hc29/HC29.21-Monday-Pub/HC29.21.10-GPU-Gaming-Pub/HC29.21.132-Volta-Choquette-NVIDIA-Final3.pdf
# https://devblogs.nvidia.com/inside-volta/
# http://on-demand.gputechconf.com/gtc/2017/presentation/s7798-luke-durant-inside-volta.pdf

# functional simulator specification
-gpgpu_ptx_instruction_classification 0
-gpgpu_ptx_sim_mode 0
-gpgpu_ptx_force_max_capability 70

# Device Limits
-gpgpu_stack_size_limit 1024
-gpgpu_heap_size_limit 8388608
-gpgpu_runtime_sync_depth_limit 2
-gpgpu_runtime_pending_launch_count_limit 2048
-gpgpu_kernel_launch_latency 5000
-gpgpu_TB_launch_latency 0

# Compute Capability
-gpgpu_compute_capability_major 7
-gpgpu_compute_capability_minor 0

# PTX execution-driven
-gpgpu_ptx_convert_to_ptxplus 0
-gpgpu_ptx_save_converted_ptxplus 0

# high level architecture configuration
-gpgpu_n_clusters 80
-gpgpu_n_cores_per_cluster 1
-gpgpu_n_mem 32
-gpgpu_n_sub_partition_per_mchannel 2

# volta clock domains
#-gpgpu_clock_domains <Core Clock>:<Interconnect Clock>:<L2 Clock>:<DRAM Clock>
-gpgpu_clock_domains 1132.0:1132.0:1132.0:850.0
# boost mode
# -gpgpu_clock_domains 1628.0:1628.0:1628.0:850.0

# shader core pipeline config
-gpgpu_shader_registers 65536
-gpgpu_registers_per_block 65536
-gpgpu_occupancy_sm_number 70
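# Illustrative occupancy arithmetic (comment only, not a config option): with
# the 2048 threads/SM limit set below, full occupancy needs at most
# 65536 / 2048 = 32 registers per thread; e.g. a kernel using 64 registers
# per thread is capped at 65536 / 64 = 1024 threads (32 warps) per SM.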

# This implies a maximum of 64 warps/SM
-gpgpu_shader_core_pipeline 2048:32
-gpgpu_shader_cta 32
-gpgpu_simd_model 1
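# Sanity check on the pipeline line above: 2048 threads/SM with a 32-thread
# warp size gives 2048 / 32 = 64 warps/SM, and with at most 32 resident CTAs,
# CTAs must average at least 2048 / 32 = 64 threads each to fill the SM.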

# Pipeline widths and number of FUs
# ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE
## Volta GV100 has 4 SP SIMD units, 4 SFU units, 4 DP units, and 4 tensor core units per core
## we need to scale the number of pipeline registers to be equal to the number of SP units
-gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4
-gpgpu_num_sp_units 4
-gpgpu_num_sfu_units 4
-gpgpu_num_dp_units 4
-gpgpu_num_int_units 4
-gpgpu_tensor_core_avail 1
-gpgpu_num_tensor_core_units 4
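# Reading the width list against the header above: all ID_OC_* and OC_EX_*
# ports are 4 wide and EX_WB is 8 wide. With the sub-core model enabled
# below, the 4 units of each type divide evenly across the 4 schedulers,
# i.e. one SP, DP, INT, SFU, and tensor unit per scheduler.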

# Instruction latencies and initiation intervals
# "ADD,MAX,MUL,MAD,DIV" (the int lists carry a sixth entry, for SHFL)
# All Div operations are executed on the SFU unit
-ptx_opcode_latency_int 4,13,4,5,145,21
-ptx_opcode_initiation_int 2,2,2,2,8,4
-ptx_opcode_latency_fp 4,13,4,5,39
-ptx_opcode_initiation_fp 2,2,2,2,4
-ptx_opcode_latency_dp 8,19,8,8,330
-ptx_opcode_initiation_dp 4,4,4,4,130
-ptx_opcode_latency_sfu 100
-ptx_opcode_initiation_sfu 8
-ptx_opcode_latency_tensor 64
-ptx_opcode_initiation_tensor 64
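# Worked example of latency vs. initiation interval: a DP divide takes 330
# cycles to produce its result, but the DP unit only accepts a new divide
# every 130 cycles, so a back-to-back stream of DP divides completes one
# result per 130 cycles once the pipeline is full.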

# Volta uses a sub-core model in which each scheduler has its own register file and EUs
# i.e. schedulers are isolated
-gpgpu_sub_core_model 1
# disable specialized operand collectors and use generic operand collectors instead
-gpgpu_enable_specialized_operand_collector 0
-gpgpu_operand_collector_num_units_gen 8
-gpgpu_operand_collector_num_in_ports_gen 8
-gpgpu_operand_collector_num_out_ports_gen 8
# volta has 8 banks, 4 schedulers, two banks per scheduler
# we increase #banks to 16 to mitigate the effect of the Register File Cache (RFC), which we do not implement in the current version
-gpgpu_num_reg_banks 16
-gpgpu_reg_file_port_throughput 2
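# Bank arithmetic (comment only): 65536 registers/SM across 16 banks is
# 65536 / 16 = 4096 registers per bank; under the sub-core model each of the
# 4 schedulers effectively owns 16 / 4 = 4 banks.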

# shared memory bank conflict detection
-gpgpu_shmem_num_banks 32
-gpgpu_shmem_limited_broadcast 0
-gpgpu_shmem_warp_parts 1
-gpgpu_coalesce_arch 60
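# Bank-conflict example (assuming the usual 4-byte word interleaving): word i
# maps to bank i mod 32, so threads striding by 32 words (128 B) all hit the
# same bank and serialize into a 32-way conflict, while stride-1 accesses
# touch 32 distinct banks and complete in a single pass.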

# Volta has four schedulers per core
-gpgpu_num_sched_per_core 4
# Greedy-then-oldest scheduler
-gpgpu_scheduler gto
## In Volta, a warp scheduler can issue 1 inst per cycle
-gpgpu_max_insn_issue_per_warp 1
-gpgpu_dual_issue_diff_exec_units 1

## L1/shared memory configuration
# <nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>
# ** Optional parameter - Required when mshr_type==Texture Fifo
# Default config is 32KB DL1 and 96KB shared memory
# In Volta, we assign the remaining shared memory to the L1 cache
# if the assigned shared memory = 0, then L1 cache = 128KB
# For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x
# disable this mode when running multiple kernels/apps concurrently
-gpgpu_adaptive_cache_config 1
# Volta unified cache has four banks
-gpgpu_l1_banks 4
-gpgpu_cache:dl1 S:1:128:256,L:L:s:N:L,A:256:8,16:0,32
-gpgpu_shmem_size 98304
-gpgpu_shmem_sizeDefault 98304
-gpgpu_shmem_per_block 65536
-gpgpu_gmem_skip_L1D 0
-gpgpu_n_cluster_ejection_buffer_size 32
-gpgpu_l1_latency 20
-gpgpu_smem_latency 20
-gpgpu_flush_l1_cache 1
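# Decoding the dl1 line above against the format string: S:1:128:256 is a
# sectored cache with 1 set, 128 B lines, and 256-way associativity, i.e.
# 1 * 128 * 256 = 32 KB of L1. Together with the 96 KB of shared memory this
# forms the 128 KB unified Volta array; the adaptive config can hand all
# 128 KB to the L1 when a kernel allocates no shared memory.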

# 32 sets, 128-byte lines, 24-way set associative per memory sub-partition (96 KB each), for 6MB of L2 in total
-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32
-gpgpu_cache:dl2_texture_only 0
-gpgpu_dram_partition_queues 64:64:64:64
-gpgpu_perf_sim_memcpy 1
-gpgpu_memory_partition_indexing 2
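# L2 size check: 32 sets * 128 B * 24 ways = 96 KB per sub-partition; with
# -gpgpu_n_mem 32 channels * 2 sub-partitions each, there are 64
# sub-partitions, so 64 * 96 KB = 6144 KB = 6 MB of L2.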

# 128 KB Inst.
-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4
-gpgpu_inst_fetch_throughput 4
# 128 KB Tex
# Note: TEX is deprecated in Volta and kept for legacy apps only. Use the L1D cache instead, via the .nc modifier or the __ldg() method
-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2
# 64 KB Const
-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4
-gpgpu_perfect_inst_const_cache 1
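# Size checks for the three caches above, using <nsets>:<bsize>:<assoc>:
# il1: 64 * 128 * 16 = 128 KB; tex: 4 * 128 * 256 = 128 KB;
# const: 128 * 64 * 8 = 64 KB, matching the size comments.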

# interconnection
#-network_mode 1
#-inter_config_file config_volta_islip.icnt
# use built-in local xbar
-network_mode 2
-icnt_in_buffer_limit 512
-icnt_out_buffer_limit 512
-icnt_subnets 2
-icnt_flit_size 40
-icnt_arbiter_algo 1

# memory partition latency config
-gpgpu_l2_rop_latency 160
-dram_latency 100

# dram model config
-gpgpu_dram_scheduler 1
-gpgpu_frfcfs_dram_sched_queue_size 64
-gpgpu_dram_return_queue_size 192

# HBM: 32 channels in total (-gpgpu_n_mem above, i.e. four 8-channel stacks), each channel 128 bits (16 bytes) wide
-gpgpu_n_mem_per_ctrlr 1
-gpgpu_dram_buswidth 16
-gpgpu_dram_burst_length 2
-dram_data_command_freq_ratio 2 # HBM is DDR
-gpgpu_mem_address_mask 1
-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCB.CCCSSSSS
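# Peak DRAM bandwidth implied by these settings: 32 channels * 16 B bus *
# 2 transfers/cycle (DDR) * 850 MHz = 870.4 GB/s, in line with the ~900 GB/s
# V100 HBM2 figure. Burst length 2 * 16 B = 32 B per access, i.e. one 32 B
# sector (a quarter of a 128 B line).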

# HBM timings are adopted from the Hynix JESD235 (HBM) standard and the NVIDIA HPCA 2017 paper (http://www.cs.utah.edu/~nil/pubs/hpca17.pdf)
# Timing for 1 GHz
# tRRDl and tWTR are missing and still need to be added
#-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=4:RCD=14:RAS=33:RP=14:RC=47:
#                        CL=14:WL=2:CDLR=3:WR=12:nbkgrp=4:CCDL=2:RTPL=4"

# Timing for 850 MHz; the V100 HBM runs at 850 MHz
-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=3:RCD=12:RAS=28:RP=12:RC=40:
                        CL=12:WL=2:CDLR=3:WR=10:nbkgrp=4:CCDL=2:RTPL=3"
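# Sanity check: the 850 MHz values are the 1 GHz values rescaled to the same
# wall-clock time, e.g. RCD: 14 cycles @ 1 GHz = 14 ns -> 14 * 0.85 = 11.9,
# rounded to 12 cycles @ 850 MHz; likewise RAS 33 -> 28 and RC 47 -> 40.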

# HBM has a dual command bus interface, so it can issue a column command and a row command in the same cycle
-dram_dual_bus_interface 1
# select lower bits for bnkgrp to increase bank-group parallelism
-dram_bnk_indexing_policy 0
-dram_bnkgrp_indexing_policy 1

#-dram_seperate_write_queue_enable 1
#-dram_write_queue_size 64:56:32

# stat collection
-gpgpu_memlatency_stat 14
-gpgpu_runtime_stat 500
-enable_ptx_file_line_stats 1
-visualizer_enabled 0

# power model configs; disabled until we create a real energy model for Volta
-power_simulation_enabled 0

# tracing functionality
#-trace_enabled 1
#-trace_components WARP_SCHEDULER,SCOREBOARD,LIVENESS
#-trace_sampling_core 0

### items for functional and timing simulation of UVM ###

# gddr size should be less than or equal to 1GB; specified in units of MB or GB
-gddr_size 1GB

# size of a gddr page; only 4KB and 2MB are available
-page_size 4KB

# number of tlb entries per SM
-tlb_size 4096

# average page table walk latency (in core cycles)
# for 4K pages set to 100, for 2M pages set to 66
-page_table_walk_latency 100

# page eviction policy
# 0 - lru 2MB (default)
# 1 - lru tree-based neighborhood
# 2 - lru sequential locality 64K
# 3 - random 4KB
# 4 - LFU 2MB
# 5 - lru 4KB
-eviction_policy 0

# invalidate clean pages directly instead of writing them back
-invalidate_clean 0

# reserve a percentage (e.g. 10 or 20) of accessed pages from eviction, in the hope that they will be accessed again in the next iteration
-reserve_accessed_page_percent 0

# percentage of the free page buffer at which page eviction is triggered (e.g. 5 or 10)
-percentage_of_free_page_buffer 0

# pcie bandwidth per direction
-pcie_bandwidth 16.0GB/s

# enable/disable GMMU statistics profiling for UVM
-sim_prof_enable 1

# disable deadlock check for UVM
-gpgpu_deadlock_detect 0

# latency to process a kernel launch (150us, or 222150 core cycles)
#-gpgpu_kernel_launch_latency 222150

# hardware prefetcher
# 0 - disabled
# 1 - tree-based neighborhood (default)
# 2 - sequential locality 64K
# 3 - random 4K
-hardware_prefetch 1

# hardware prefetcher under over-subscription
# 0 - disable upon eviction (default)
# 1 - tree-based neighborhood
# 2 - sequential locality 64K
# 3 - random 4K
-hwprefetch_oversub 1

# latency in core cycles to handle a page fault (45us)
# encompasses the overhead of stalling threads, resolving memory addresses, walking the page table, maintaining page flags, and transferring chunks in order
-page_fault_latency 66645

# enable accurate simulation that stalls warps and serializes accesses during page fault handling (default 0)
-enable_accurate_simulation 0

# Enable direct CPU-memory access from the GPU
# 0 - disable
# 1 - adaptive
# 2 - always
# 3 - after oversubscription
-enable_dma 0

# Access counter threshold for migrating a page from CPU to GPU
-migrate_threshold 8

# Oversubscription multiplicative penalty factor for adaptive DMA
-multiply_dma_penalty 2

# enable access pattern detection, the policy engine, and adaptive memory management
-enable_smart_runtime 0

# enable skipping cycles when all warps are stalled waiting for page faults to return
-skip_cycles_enable 1
| 289 | +-skip_cycles_enable 1 |
0 commit comments