Update to latest accel-sim:dev #236

Merged (110 commits, Oct 18, 2021)

Commits
b7b9dc0
Fix for bugs in lazy write handling
gvoskuilen Oct 26, 2020
2d73b61
Merge pull request #3 from gvoskuilen/dev
mkhairy Oct 26, 2020
950464e
change address type into ull
allencho1222 Nov 9, 2020
07f77e1
do not truncate 32 MSB bits of the memory address
allencho1222 Nov 9, 2020
132c2ce
added MSHR_HIT
JRPan Nov 15, 2020
85e36b9
Merge pull request #6 from JRPan/add_mshr
mkhairy Nov 22, 2020
29cce50
Merge pull request #4 from allencho1222/patch-1
tgrogers Jan 27, 2021
e6b0608
Merge pull request #5 from allencho1222/patch-2
tgrogers Jan 27, 2021
5ac0b60
Merge branch 'dev' of https://github.com/accel-sim/gpgpu-sim_distribu…
mkhairy Jan 28, 2021
f3a0077
bug fix was_writeback_sent
JRPan Feb 12, 2021
67f89ab
Merge pull request #7 from JRPan/fix-was_writeback_sent
mkhairy Feb 16, 2021
51d9925
fix hash function
JRPan Feb 15, 2021
2f96645
Merge pull request #9 from JRPan/fix-cache-hash
mkhairy Feb 19, 2021
b430b36
adding new RTX 3070 config
mkhairy Feb 25, 2021
deb5eb5
Merge branch 'dev' of https://github.com/accel-sim/gpgpu-sim_distribu…
mkhairy Feb 25, 2021
09f10eb
change the L1 cache policy to be on-miss based on recent ubench
mkhairy Mar 25, 2021
1ee03f0
change the L1 cache policy based on recent ubench
mkhairy Mar 25, 2021
5533464
partition CU allocation, add prints
barnes88 May 9, 2021
645a0ea
minor fixes
barnes88 May 9, 2021
46423a2
useful print statement
barnes88 May 9, 2021
b672880
validated collector unit partitioning based on scheduler
barnes88 May 9, 2021
fa76ab4
sub core model dispatches only to assigned exec pipelines
barnes88 May 10, 2021
c905726
minor fix accessing du
barnes88 May 10, 2021
a72b84e
fix find_ready reg_id
barnes88 May 10, 2021
6ad5bad
dont need du id
barnes88 May 10, 2021
9219236
remove prints
barnes88 May 10, 2021
52a890c
need at least 1 cu per sched for sub_core model, fix find_ready() reg_id
barnes88 May 11, 2021
2db9120
move reg_id calc to cu object init
barnes88 May 11, 2021
4825a1d
fix assert
barnes88 May 11, 2021
e2b410d
clean up redundant method args
barnes88 May 11, 2021
9c0156b
more cleanup
barnes88 May 11, 2021
28c3c94
cleanup find_ready
barnes88 May 11, 2021
28d0565
partition issue() in the shader execute stage
barnes88 May 11, 2021
08ad045
Merge branch 'sub_core_devel' of github.com:barnes88/gpgpu-sim_distri…
barnes88 May 11, 2021
ec55c68
minor fixes, pure virtual calls
barnes88 May 11, 2021
71455d8
add prints for ex issue validation
barnes88 May 12, 2021
640674b
issue function needed to be constrained
barnes88 May 12, 2021
9b6af84
fix print, move simd::issue() impl to .cc file
barnes88 May 12, 2021
6ae2391
fix prints / segfault
barnes88 May 12, 2021
a450d74
remove prints
barnes88 May 12, 2021
6a09900
rm unnecessary instr get
barnes88 May 12, 2021
5945d70
specialized unit should be partitioned too
barnes88 May 13, 2021
92c814a
run changes through clang-format
barnes88 May 13, 2021
db10197
rm old dirs in format-code.sh
barnes88 May 13, 2021
c526262
fix adaptive cache cfg option parsing data type
JRPan May 13, 2021
c51350d
Merge pull request #13 from JRPan/fix-config-parser
tgrogers May 13, 2021
f2a7d9c
fixing streaming cache based on recent ubench
mkhairy May 15, 2021
1347395
adding the missing xoring hashing
mkhairy May 15, 2021
6319e31
moving reg file read to read_operands function as before
mkhairy May 15, 2021
d89f9f7
Merge branch 'dev' of https://github.com/accel-sim/gpgpu-sim_distribu…
mkhairy May 17, 2021
c94b883
code refactoring cycle()
mkhairy May 17, 2021
7d9a12f
specialized unit get_ready() was missing subcore
barnes88 May 17, 2021
585dcf5
Merge pull request #12 from barnes88/sub_core_devel
mkhairy May 18, 2021
6121a88
Merge branch 'dev' of https://github.com/accel-sim/gpgpu-sim_distribu…
mkhairy May 18, 2021
0f30305
dirty counter added. No incrementing yet
JRPan Feb 15, 2021
615f173
store ack for new warps
JRPan Feb 20, 2021
ad72041
sending cache block byte mask
JRPan Mar 2, 2021
bb19c0c
update mf breakdown at L2
JRPan Mar 2, 2021
e05fa4a
little bug fix - flush()
JRPan Mar 2, 2021
804ee90
sending byte mask for all policies
JRPan Mar 8, 2021
b3dab5e
set byte mask on fill
JRPan Mar 8, 2021
40077df
solve deadlock for non-sectored cache configs
JRPan Mar 8, 2021
64bf6fd
dirty counter not resetting after kernel finish
JRPan Mar 18, 2021
a374b33
remove MSHR_HIT from cache total access
JRPan Mar 26, 2021
f6fb56b
check sector readable only on reads
JRPan Apr 6, 2021
994fb19
reset dirty counter
JRPan May 4, 2021
7306930
remove runtime check of dirty counter
JRPan May 12, 2021
0601354
Add WT to lazy_fetch_on_read
JRPan May 18, 2021
f783351
new configs - adaptive cache and cache write ratio
JRPan May 17, 2021
a2b1b1c
adaptive cache - update
JRPan May 17, 2021
f70f5d6
re-wording/formatting
JRPan May 19, 2021
4a762a9
formatting again
JRPan May 19, 2021
4c354eb
minor improvements
JRPan May 19, 2021
f27da22
Use cache config multiplier when possible
JRPan May 19, 2021
0e4f12a
Merge pull request #14 from JRPan/spring-2021-all
mkhairy May 19, 2021
1875132
Merge branch 'dev' into adaptive-cache
JRPan May 19, 2021
2b2b6a2
Merge pull request #15 from JRPan/adaptive-cache
mkhairy May 19, 2021
14f22bc
add checking on spec unit in subcore
mkhairy May 19, 2021
3363536
Merge branch 'dev' of https://github.com/accel-sim/gpgpu-sim_distribu…
mkhairy May 19, 2021
604baaf
fixing the failing of merging
mkhairy May 19, 2021
a2ba2f5
updating config files with right adaptive cache parameters
mkhairy May 19, 2021
b63d19a
updating config files
mkhairy May 19, 2021
e3d186b
changing @sets to 4 based on recent ubenches
mkhairy May 19, 2021
24ffab2
moving shmem option to the base class and change the code to accept t…
mkhairy May 20, 2021
fedcde3
moving the unified size from the base class config to l1 config
mkhairy May 20, 2021
8aee56d
rename set_dirty_byte_mask
mkhairy May 20, 2021
b466afe
eliminate redundant code in gpu-cache.h
mkhairy May 20, 2021
7fac247
change L1 cache config in Volta+ to be write-through and write-alloca…
mkhairy May 20, 2021
0d33266
oops delete this config, it should not be pushed
mkhairy May 20, 2021
2aef4e3
Merge pull request #16 from mkhairy/dev
mkhairy May 20, 2021
c8eca04
fix merge conflict
JRPan May 17, 2021
f665ad5
L2 breakdown - reuse mf allocator
JRPan May 21, 2021
b814c52
cast to float - dirty line percentage
JRPan May 21, 2021
ce4f20f
Merge pull request #17 from JRPan/rewrite-l2-breakdown
mkhairy May 21, 2021
3b75d8f
Update version
mkhairy May 22, 2021
7e48560
Update CHANGES
mkhairy May 22, 2021
b6409b4
Update README.md
mkhairy May 22, 2021
6c9e13d
format code
mkhairy May 23, 2021
778962e
updating the configs based on the tuner output
mkhairy May 26, 2021
3eea014
changing kernel latency
mkhairy May 26, 2021
6ad461a
fixing configs
mkhairy May 27, 2021
110aeb1
rewrite shmem_option parsing
JRPan May 31, 2021
04462cb
update readable
JRPan Jun 3, 2021
e9d781a
minor improvements
JRPan Jun 3, 2021
0f088dc
correct dirty counter
JRPan Jun 16, 2021
3cf24b8
WT in lazy fetch on read
JRPan Jun 23, 2021
b1befa8
Adding restricted round robin scheduler
JRPan Aug 16, 2021
b658147
better oc selecting when sub core enabled
JRPan Aug 16, 2021
a8256e5
Update volta to use lrr scheduler
JRPan Aug 23, 2021
84c4f46
Ampere and Turing also lrr scheduler
JRPan Aug 23, 2021
12 changes: 12 additions & 0 deletions CHANGES
@@ -1,4 +1,16 @@
LOG:
Version 4.1.0 versus 4.0.0
-Features:
1- Supporting L1 write-allocate with sub-sector writing policy as in Volta+ hardware, and changing the Volta+ cards config to make L1 write-allocate with write-through
2- Making the L1 adaptive cache policy to be configurable
3- Adding Ampere RTX 3060 config files
-Bugs:
1- Fixing L1 bank hash function bug
2- Fixing L1 read hit counters in gpgpu-sim to match nvprof, to achieve more accurate L1 correlation with the HW
3- Fixing bugs in lazy write handling, thanks to Gwendolyn Voskuilen from Sandia labs for this fix
4- Fixing the backend pipeline for sub_core model
5- Fixing Memory stomp bug at the shader_config
6- Some code refactoring:
Version 4.0.0 (development branch) versus 3.2.3
-Front-End:
1- Support .nc cache modifier and __ldg function to access the read-only L1D cache
9 changes: 7 additions & 2 deletions README.md
@@ -11,22 +11,26 @@ This version of GPGPU-Sim has been tested with a subset of CUDA version 4.2,
Please see the copyright notice in the file COPYRIGHT distributed with this
release in the same directory as this file.

GPGPU-Sim 4.0 is compatible with Accel-Sim simulation framework. With the support
of Accel-Sim, GPGPU-Sim 4.0 can run NVIDIA SASS traces (trace-based simulation)
generated by NVIDIA's dynamic binary instrumentation tool (NVBit). For more information
about Accel-Sim, see [https://accel-sim.github.io/](https://accel-sim.github.io/)

If you use GPGPU-Sim 4.0 in your research, please cite:

Mahmoud Khairy, Zhesheng Shen, Tor M. Aamodt, Timothy G Rogers.
Accel-Sim: An Extensible Simulation Framework for Validated GPU Modeling.
In proceedings of the 47th IEEE/ACM International Symposium on Computer Architecture (ISCA),
May 29 - June 3, 2020.

If you use CuDNN or PyTorch support, checkpointing or our new debugging tool for functional
If you use CuDNN or PyTorch support (execution-driven simulation), checkpointing or our new debugging tool for functional
simulation errors in GPGPU-Sim for your research, please cite:

Jonathan Lew, Deval Shah, Suchita Pati, Shaylin Cattell, Mengchi Zhang, Amruth Sandhupatla,
Christopher Ng, Negar Goli, Matthew D. Sinclair, Timothy G. Rogers, Tor M. Aamodt
Analyzing Machine Learning Workloads Using a Detailed GPU Simulator, arXiv:1811.08933,
https://arxiv.org/abs/1811.08933


If you use the Tensor Core model in GPGPU-Sim or GPGPU-Sim's CUTLASS Library
for your research please cite:

@@ -261,6 +265,7 @@ To clean the docs run
The documentation resides at doc/doxygen/html.

To run Pytorch applications with the simulator, install the modified Pytorch library as well by following instructions [here](https://github.com/gpgpu-sim/pytorch-gpgpu-sim).

## Step 3: Run

Before we run, we need to make sure the application's executable file is dynamically linked to CUDA runtime library. This can be done during compilation of your program by introducing the nvcc flag "--cudart shared" in makefile (quotes should be excluded).
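For example, a minimal compile invocation that satisfies this requirement might look like the following (the source and output names are placeholders, not files shipped with the simulator):

    # link the application against the shared CUDA runtime so the simulator's
    # libcudart.so can be substituted at run time
    nvcc --cudart shared -o myapp myapp.cu

Running ldd on the resulting binary should then list libcudart as a dynamically resolved dependency.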
119 changes: 57 additions & 62 deletions configs/tested-cfgs/SM75_RTX2060/gpgpusim.config
@@ -1,8 +1,3 @@
# This config models the Turing RTX 2060
# For more info about turing architecture:
# 1- https://www.nvidia.com/content/dam/en-zz/Solutions/design-visualization/technologies/turing-architecture/NVIDIA-Turing-Architecture-Whitepaper.pdf
# 2- "RTX on—The NVIDIA Turing GPU", IEEE MICRO 2020

# functional simulator specification
-gpgpu_ptx_instruction_classification 0
-gpgpu_ptx_sim_mode 0
@@ -14,6 +9,7 @@
-gpgpu_runtime_sync_depth_limit 2
-gpgpu_runtime_pending_launch_count_limit 2048
-gpgpu_kernel_launch_latency 5000
-gpgpu_TB_launch_latency 0

# Compute Capability
-gpgpu_compute_capability_major 7
@@ -27,91 +23,93 @@
-gpgpu_n_clusters 30
-gpgpu_n_cores_per_cluster 1
-gpgpu_n_mem 12
-gpgpu_n_sub_partition_per_mchannel 2
-gpgpu_n_sub_partition_per_mchannel 2

# volta clock domains
# clock domains
#-gpgpu_clock_domains <Core Clock>:<Interconnect Clock>:<L2 Clock>:<DRAM Clock>
-gpgpu_clock_domains 1365.0:1365.0:1365.0:3500.0
# boost mode
# -gpgpu_clock_domains 1680.0:1680.0:1680.0:3500.0
-gpgpu_clock_domains 1365:1365:1365:3500.5

# shader core pipeline config
-gpgpu_shader_registers 65536
-gpgpu_registers_per_block 65536
-gpgpu_occupancy_sm_number 75

# This implies a maximum of 32 warps/SM
-gpgpu_shader_core_pipeline 1024:32
-gpgpu_shader_cta 32
-gpgpu_shader_core_pipeline 1024:32
-gpgpu_shader_cta 16
-gpgpu_simd_model 1

# Pipeline widths and number of FUs
# ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE
## Turing has 4 SP SIMD units, 4 INT units, 4 SFU units, 8 Tensor core units
## We need to scale the number of pipeline registers to be equal to the number of SP units
-gpgpu_pipeline_widths 4,0,4,4,4,4,0,4,4,4,8,4,4
-gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4
-gpgpu_num_sp_units 4
-gpgpu_num_sfu_units 4
-gpgpu_num_dp_units 4
-gpgpu_num_int_units 4
-gpgpu_tensor_core_avail 1
-gpgpu_num_tensor_core_units 4
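# As a purely positional restatement of the comment above (13 stage names,
# 13 widths), the new value reads as follows:
# -gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4
#   ID_OC_SP=4   ID_OC_DP=4   ID_OC_INT=4   ID_OC_SFU=4   ID_OC_MEM=4
#   OC_EX_SP=4   OC_EX_DP=4   OC_EX_INT=4   OC_EX_SFU=4   OC_EX_MEM=4
#   EX_WB=8      ID_OC_TENSOR_CORE=4        OC_EX_TENSOR_CORE=4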

# Instruction latencies and initiation intervals
# "ADD,MAX,MUL,MAD,DIV"
# All Div operations are executed on SFU unit
-ptx_opcode_latency_int 4,13,4,5,145,32
-ptx_opcode_initiation_int 2,2,2,2,8,4
-ptx_opcode_latency_fp 4,13,4,5,39
-ptx_opcode_latency_int 4,4,4,4,21
-ptx_opcode_initiation_int 2,2,2,2,2
-ptx_opcode_latency_fp 4,4,4,4,39
-ptx_opcode_initiation_fp 2,2,2,2,4
-ptx_opcode_latency_dp 8,19,8,8,330
-ptx_opcode_initiation_dp 4,4,4,4,130
-ptx_opcode_latency_sfu 100
-ptx_opcode_latency_dp 64,64,64,64,330
-ptx_opcode_initiation_dp 64,64,64,64,130
-ptx_opcode_latency_sfu 21
-ptx_opcode_initiation_sfu 8
-ptx_opcode_latency_tesnor 64
-ptx_opcode_initiation_tensor 64

# Turing has four schedulers per core
-gpgpu_num_sched_per_core 4
# Greedy then oldest scheduler
-gpgpu_scheduler gto
## In Turing, a warp scheduler can issue 1 inst per cycle
-gpgpu_max_insn_issue_per_warp 1
-gpgpu_dual_issue_diff_exec_units 1

# shared memory bankconflict detection
-gpgpu_shmem_num_banks 32
-gpgpu_shmem_limited_broadcast 0
-gpgpu_shmem_warp_parts 1
-gpgpu_coalesce_arch 75

# Trung has sub core model, in which each scheduler has its own register file and EUs
# sub core model: in which each scheduler has its own register file and EUs
# i.e. schedulers are isolated
-gpgpu_sub_core_model 1
# disable specialized operand collectors and use generic operand collectors instead
-gpgpu_enable_specialized_operand_collector 0
-gpgpu_operand_collector_num_units_gen 8
-gpgpu_operand_collector_num_in_ports_gen 8
-gpgpu_operand_collector_num_out_ports_gen 8
# turing has 8 banks dual-port, 4 schedulers, two banks per scheduler
# we increase #banks to 16 to mitigate the effect of Regisrer File Cache (RFC) which we do not implement in the current version
-gpgpu_num_reg_banks 16
# register banks
-gpgpu_num_reg_banks 8
-gpgpu_reg_file_port_throughput 2

# warp scheduling
-gpgpu_num_sched_per_core 4
-gpgpu_scheduler lrr
# a warp scheduler issue mode
-gpgpu_max_insn_issue_per_warp 1
-gpgpu_dual_issue_diff_exec_units 1

## L1/shared memory configuration
# <nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>
# ** Optional parameter - Required when mshr_type==Texture Fifo
-gpgpu_adaptive_cache_config 0
# In adaptive cache, we adaptively assign the remaining shared memory to L1 cache
# For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x
-gpgpu_adaptive_cache_config 1
-gpgpu_shmem_option 32,64
-gpgpu_unified_l1d_size 96
# L1 cache configuration
-gpgpu_l1_banks 4
-gpgpu_cache:dl1 S:1:128:512,L:L:s:N:L,A:256:8,16:0,32
-gpgpu_shmem_size 65536
-gpgpu_shmem_sizeDefault 65536
-gpgpu_shmem_per_block 65536
-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:256:32,16:0,32
-gpgpu_l1_latency 32
-gpgpu_gmem_skip_L1D 0
-gpgpu_n_cluster_ejection_buffer_size 32
-gpgpu_l1_latency 20
-gpgpu_smem_latency 20
-gpgpu_flush_l1_cache 1
-gpgpu_n_cluster_ejection_buffer_size 32
-gpgpu_l1_cache_write_ratio 25
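# The new dl1 string can be read field by field against the template documented
# at the top of this block; the expansions of the single-letter codes below are
# assumptions inferred from that template and from this PR's change log, not
# something the diff states explicitly:
# -gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:256:32,16:0,32
#   S           cache type (assumed: S = sectored, N = normal)
#   4:128:64    <nsets>:<bsize>:<assoc> -> 4 sets x 128 B lines x 64 ways = 32 KB of data store
#   L:T:m:L:L   <rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>
#               (assumed: LRU, write-through, on-miss allocation, lazy-fetch-on-read, plus the set hash)
#   A:256:32    <mshr>:<N>:<merge> -> MSHR with 256 entries, up to 32 merged requests each (assumed)
#   16:0,32     miss queue and trailing fields per <mq>:**<fifo_entry> (assumed)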

# 64 sets, each 128 bytes 16-way for each memory sub partition (128 KB per memory sub partition). This gives us 3MB L2 cache
# shared memory configuration
-gpgpu_shmem_size 65536
-gpgpu_shmem_sizeDefault 65536
-gpgpu_shmem_per_block 49152
-gpgpu_smem_latency 30
# shared memory bankconflict detection
-gpgpu_shmem_num_banks 32
-gpgpu_shmem_limited_broadcast 0
-gpgpu_shmem_warp_parts 1
-gpgpu_coalesce_arch 75

# L2 cache
-gpgpu_cache:dl2 S:64:128:16,L:B:m:L:P,A:192:4,32:0,32
-gpgpu_cache:dl2_texture_only 0
-gpgpu_dram_partition_queues 64:64:64:64
@@ -122,44 +120,41 @@
-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4
-gpgpu_inst_fetch_throughput 4
# 128 KB Tex
# Note, TEX is deprected in Volta, It is used for legacy apps only. Use L1D cache instead with .nc modifier or __ldg mehtod
# Note, TEX is deprected since Volta, It is used for legacy apps only. Use L1D cache instead with .nc modifier or __ldg mehtod
-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2
# 64 KB Const
-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4
-gpgpu_perfect_inst_const_cache 1

# interconnection
#-network_mode 1
#-inter_config_file config_turing_islip.icnt
# use built-in local xbar
-network_mode 2
-icnt_in_buffer_limit 512
-icnt_out_buffer_limit 512
-icnt_subnets 2
-icnt_arbiter_algo 1
-icnt_flit_size 40
-icnt_arbiter_algo 1

# memory partition latency config
-gpgpu_l2_rop_latency 160
-dram_latency 100
-gpgpu_l2_rop_latency 194
-dram_latency 96

# dram model config
# dram sched config
-gpgpu_dram_scheduler 1
-gpgpu_frfcfs_dram_sched_queue_size 64
-gpgpu_dram_return_queue_size 192

# Turing has GDDR6
# http://monitorinsider.com/GDDR6.html
# dram model config
-gpgpu_n_mem_per_ctrlr 1
-gpgpu_dram_buswidth 2
-gpgpu_dram_burst_length 16
-dram_data_command_freq_ratio 4
-gpgpu_mem_address_mask 1
-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCC.BCCSSSSS

# Use the same GDDR5 timing, scaled to 3500MHZ
-gpgpu_dram_timing_opt "nbk=16:CCD=4:RRD=10:RCD=20:RAS=50:RP=20:RC=62:
CL=20:WL=8:CDLR=9:WR=20:nbkgrp=4:CCDL=4:RTPL=4"
# Mem timing
-gpgpu_dram_timing_opt nbk=16:CCD=4:RRD=12:RCD=24:RAS=55:RP=24:RC=78:CL=24:WL=8:CDLR=10:WR=24:nbkgrp=4:CCDL=6:RTPL=4
-dram_dual_bus_interface 0

# select lower bits for bnkgrp to increase bnkgrp parallelism
-dram_bnk_indexing_policy 0
@@ -174,7 +169,7 @@
-enable_ptx_file_line_stats 1
-visualizer_enabled 0

# power model configs, disable it untill we create a real energy model for Volta
# power model configs, disable it untill we create a real energy model
-power_simulation_enabled 0

# tracing functionality
23 changes: 13 additions & 10 deletions configs/tested-cfgs/SM7_QV100/gpgpusim.config
@@ -94,12 +94,12 @@
-gpgpu_shmem_num_banks 32
-gpgpu_shmem_limited_broadcast 0
-gpgpu_shmem_warp_parts 1
-gpgpu_coalesce_arch 60
-gpgpu_coalesce_arch 70

# Volta has four schedulers per core
-gpgpu_num_sched_per_core 4
# Greedy then oldest scheduler
-gpgpu_scheduler gto
-gpgpu_scheduler lrr
## In Volta, a warp scheduler can issue 1 inst per cycle
-gpgpu_max_insn_issue_per_warp 1
-gpgpu_dual_issue_diff_exec_units 1
@@ -113,17 +113,21 @@
# For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x
# disable this mode in case of multi kernels/apps execution
-gpgpu_adaptive_cache_config 1
# Volta unified cache has four banks
-gpgpu_shmem_option 0,8,16,32,64,96
-gpgpu_unified_l1d_size 128
# L1 cache configuration
-gpgpu_l1_banks 4
-gpgpu_cache:dl1 S:1:128:256,L:L:s:N:L,A:256:8,16:0,32
-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32
-gpgpu_l1_cache_write_ratio 25
-gpgpu_l1_latency 20
-gpgpu_gmem_skip_L1D 0
-gpgpu_flush_l1_cache 1
-gpgpu_n_cluster_ejection_buffer_size 32
# shared memory configuration
-gpgpu_shmem_size 98304
-gpgpu_shmem_sizeDefault 98304
-gpgpu_shmem_per_block 65536
-gpgpu_gmem_skip_L1D 0
-gpgpu_n_cluster_ejection_buffer_size 32
-gpgpu_l1_latency 20
-gpgpu_smem_latency 20
-gpgpu_flush_l1_cache 1
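# As an illustration of the adaptive split described for these configs (the
# part of the unified 128 KB that a kernel does not claim as shared memory is
# assumed to be handed to L1D): if a kernel's shared-memory demand selects the
# 32 KB option from -gpgpu_shmem_option, L1D gets 128 - 32 = 96 KB; selecting
# the 96 KB option leaves 32 KB for L1D.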

# 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB per memory sub partition). This gives us 6MB L2 cache
-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32
@@ -201,5 +205,4 @@
# tracing functionality
#-trace_enabled 1
#-trace_components WARP_SCHEDULER,SCOREBOARD
#-trace_sampling_core 0

#-trace_sampling_core 0
18 changes: 11 additions & 7 deletions configs/tested-cfgs/SM7_TITANV/gpgpusim.config
@@ -100,7 +100,7 @@
# Volta has four schedulers per core
-gpgpu_num_sched_per_core 4
# Greedy then oldest scheduler
-gpgpu_scheduler gto
-gpgpu_scheduler lrr
## In Volta, a warp scheduler can issue 1 inst per cycle
-gpgpu_max_insn_issue_per_warp 1
-gpgpu_dual_issue_diff_exec_units 1
Expand All @@ -114,17 +114,21 @@
# For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x
# disable this mode in case of multi kernels/apps execution
-gpgpu_adaptive_cache_config 1
# Volta unified cache has four banks
-gpgpu_shmem_option 0,8,16,32,64,96
-gpgpu_unified_l1d_size 128
# L1 cache configuration
-gpgpu_l1_banks 4
-gpgpu_cache:dl1 S:1:128:256,L:L:s:N:L,A:256:8,16:0,32
-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32
-gpgpu_l1_cache_write_ratio 25
-gpgpu_gmem_skip_L1D 0
-gpgpu_l1_latency 20
-gpgpu_flush_l1_cache 1
-gpgpu_n_cluster_ejection_buffer_size 32
# shared memory configuration
-gpgpu_shmem_size 98304
-gpgpu_shmem_sizeDefault 98304
-gpgpu_shmem_per_block 65536
-gpgpu_gmem_skip_L1D 0
-gpgpu_n_cluster_ejection_buffer_size 32
-gpgpu_l1_latency 20
-gpgpu_smem_latency 20
-gpgpu_flush_l1_cache 1

# 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB per memory sub partition). This gives us 4.5MB L2 cache
-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32