-
Notifications
You must be signed in to change notification settings - Fork 1
/
gpgpusim.config
133 lines (108 loc) · 4.35 KB
/
gpgpusim.config
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
# functional simulator specification
-gpgpu_ptx_instruction_classification 0
-gpgpu_ptx_sim_mode 0
-gpgpu_ptx_force_max_capability 20
# SASS execution (only supported with CUDA >= 4.0)
-gpgpu_ptx_convert_to_ptxplus 0
-gpgpu_ptx_save_converted_ptxplus 0
# high level architecture configuration
-gpgpu_n_clusters 15
-gpgpu_n_cores_per_cluster 1
-gpgpu_n_mem 6
-gpgpu_n_sub_partition_per_mchannel 2
# Fermi clock domains
#-gpgpu_clock_domains <Core Clock>:<Interconnect Clock>:<L2 Clock>:<DRAM Clock>
# In Fermi, each pipeline has 16 execution units, so the Core clock needs to be divided
# by 2. (GPGPU-Sim simulates a warp (32 threads) in a single cycle). 1400/2 = 700
-gpgpu_clock_domains 700.0:700.0:700.0:924.0
# shader core pipeline config
-gpgpu_shader_registers 32768
# This implies a maximum of 48 warps/SM
-gpgpu_shader_core_pipeline 1536:32
-gpgpu_shader_cta 8
-gpgpu_simd_model 1
# Pipeline widths and number of FUs
# ID_OC_SP,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_SFU,OC_EX_MEM,EX_WB
-gpgpu_pipeline_widths 2,1,1,2,1,1,2
-gpgpu_num_sp_units 2
-gpgpu_num_sfu_units 1
# Instruction latencies and initiation intervals
# "ADD,MAX,MUL,MAD,DIV"
-ptx_opcode_latency_int 4,13,4,5,145
-ptx_opcode_initiation_int 1,2,2,1,8
-ptx_opcode_latency_fp 4,13,4,5,39
-ptx_opcode_initiation_fp 1,2,1,1,4
-ptx_opcode_latency_dp 8,19,8,8,330
-ptx_opcode_initiation_dp 8,16,8,8,130
# In Fermi, the cache and shared memory can be configured to 16kb:48kb(default) or 48kb:16kb
# <nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>
# ** Optional parameter - Required when mshr_type==Texture Fifo
# Note: Hashing set index function (H) only applies to a set size of 32 or 64.
-gpgpu_cache:dl1 32:128:4,L:L:m:N:H,A:32:8,8
-gpgpu_shmem_size 49152
# The alternative configuration for fermi in case cudaFuncCachePreferL1 is selected
#-gpgpu_cache:dl1 64:128:6,L:L:m:N:H,A:32:8,8
#-gpgpu_shmem_size 16384
# 64 sets, each 128 bytes 8-way for each memory sub partition. This gives 786KB L2 cache
-gpgpu_cache:dl2 64:128:8,L:B:m:W:L,A:32:4,4:0,32
-gpgpu_cache:dl2_texture_only 0
-gpgpu_cache:il1 4:128:4,L:R:f:N:L,A:2:32,4
-gpgpu_tex_cache:l1 4:128:24,L:R:m:N:L,F:128:4,128:2
-gpgpu_const_cache:l1 64:64:2,L:R:f:N:L,A:2:32,4
# enable operand collector
-gpgpu_operand_collector_num_units_sp 6
-gpgpu_operand_collector_num_units_sfu 8
-gpgpu_operand_collector_num_in_ports_sp 2
-gpgpu_operand_collector_num_out_ports_sp 2
-gpgpu_num_reg_banks 16
# shared memory bankconflict detection
-gpgpu_shmem_num_banks 32
-gpgpu_shmem_limited_broadcast 0
-gpgpu_shmem_warp_parts 1
-gpgpu_max_insn_issue_per_warp 1
# interconnection
-network_mode 1
-inter_config_file config_fermi_islip.icnt
# memory partition latency config
-rop_latency 120
-dram_latency 100
# dram model config
-gpgpu_dram_scheduler 1
# The DRAM return queue and the scheduler queue together should provide buffer
# to sustain the memory level parallelism to tolerate DRAM latency
# To allow 100% DRAM utility, there should at least be enough buffer to sustain
# the minimum DRAM latency (100 core cycles). I.e.
# Total buffer space required = 100 x 924MHz / 700MHz = 132
-gpgpu_frfcfs_dram_sched_queue_size 16
-gpgpu_dram_return_queue_size 116
# for Fermi, bus width is 384bits, this is 8 bytes (4 bytes at each DRAM chip) per memory partition
-gpgpu_n_mem_per_ctrlr 2
-gpgpu_dram_buswidth 4
-gpgpu_dram_burst_length 8
-dram_data_command_freq_ratio 4 # GDDR5 is QDR
-gpgpu_mem_address_mask 1
-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.BBBCCCCB.CCSSSSSS
# GDDR5 timing from hynix H5GQ1H24AFR
# to disable bank groups, set nbkgrp to 1 and tCCDL and tRTPL to 0
-gpgpu_dram_timing_opt "nbk=16:CCD=2:RRD=6:RCD=12:RAS=28:RP=12:RC=40:
CL=12:WL=4:CDLR=5:WR=12:nbkgrp=4:CCDL=3:RTPL=2"
# Fermi has two schedulers per core
-gpgpu_num_sched_per_core 2
# Two Level Scheduler with active and pending pools
#-gpgpu_scheduler two_level_active:6:0:1
# Loose round robbin scheduler
#-gpgpu_scheduler lrr
# Greedy then oldest scheduler
-gpgpu_scheduler gto
# stat collection
-gpgpu_memlatency_stat 14
-gpgpu_runtime_stat 500
-enable_ptx_file_line_stats 1
-visualizer_enabled 0
# power model configs
-power_simulation_enabled 1
-gpuwattch_xml_file gpuwattch_gtx480.xml
# tracing functionality
#-trace_enabled 1
#-trace_components WARP_SCHEDULER,SCOREBOARD
#-trace_sampling_core 0